├── README.md ├── v4.x ├── uksm-4.0.patch ├── uksm-4.1.patch ├── uksm-4.10.patch ├── uksm-4.11.patch ├── uksm-4.12.patch ├── uksm-4.13.patch ├── uksm-4.14.patch ├── uksm-4.15.patch ├── uksm-4.16.patch ├── uksm-4.17.patch ├── uksm-4.18.patch ├── uksm-4.19.patch ├── uksm-4.2.patch ├── uksm-4.20.patch ├── uksm-4.3.patch ├── uksm-4.4.patch ├── uksm-4.5.patch ├── uksm-4.6.patch ├── uksm-4.7.patch ├── uksm-4.8.patch └── uksm-4.9.patch └── v5.x ├── uksm-5.0.patch ├── uksm-5.1.patch ├── uksm-5.10.patch ├── uksm-5.11.patch ├── uksm-5.12.patch ├── uksm-5.13.patch ├── uksm-5.14.patch ├── uksm-5.15.patch ├── uksm-5.16.patch ├── uksm-5.17.patch ├── uksm-5.2.patch ├── uksm-5.3.patch ├── uksm-5.4.patch ├── uksm-5.5.patch ├── uksm-5.6.patch ├── uksm-5.7.patch ├── uksm-5.8.patch └── uksm-5.9.patch /README.md: -------------------------------------------------------------------------------- 1 | # UKSM 2 | ---------------------------------------------------- 3 | The patches in this repo are the latest UKSM patches. 4 | 5 | The current release number is 0.1.2.6. 6 | 7 | This release includes two bug fixes from Huawei; many thanks to their engineers and especially to @colo-ft, who submitted the patches. 8 | 9 | The changelog for all versions is in Documentation/vm/uksm.txt. 10 | 11 | # What is it? 12 | 13 | The Ultra Kernel Samepage Merging feature 14 | ---------------------------------------------- 15 | 16 | Ultra KSM. Copyright (C) 2011-2016 Nai Xia 17 | 18 | This is an improvement upon KSM. Some basic data structures and routines 19 | are borrowed from ksm.c. 20 | 21 | Its new features: 22 | 23 | 1. Full system scan: 24 | It automatically scans all user processes' anonymous VMAs. Kernel-user 25 | interaction to submit a memory area to KSM is no longer needed. 26 | 27 | 2. Rich area detection: 28 | It automatically detects rich areas containing abundant duplicated 29 | pages. Rich areas are given full scan speed. Poor areas are 30 | sampled at a reasonable speed with very low CPU consumption. 31 | 32 | 3. Ultra per-page scan speed improvement: 33 | A new hash algorithm is proposed. As a result, on a machine with a 34 | Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it 35 | can scan memory areas that do not contain duplicated pages at a speed of 36 | 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of 37 | 477MB/sec ~ 923MB/sec. 38 | 39 | 4. Thrashing area avoidance: 40 | Thrashing areas (VMAs with frequent KSM page break-outs) can be 41 | filtered out. My benchmark shows this is more efficient than KSM's per-page 42 | hash-value-based volatile page detection. 43 | 44 | 45 | 5. Misc changes upon KSM: 46 | * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page 47 | comparison. It's much faster than the default C version on x86. 48 | * rmap_item now has a struct page * member to loosely cache an 49 | address-->page mapping, which avoids many time-costly calls to 50 | follow_page(). 51 | * The VMA creation/exit procedures are hooked to let Ultra KSM know about them. 52 | * try_to_merge_two_pages() can now revert a pte if it fails, so no 53 | break_ksm is needed in this case. 54 | 55 | 6. Full zero page consideration (contributed by Figo Zhang): 56 | uksmd now treats full zero pages as special pages and merges them into a 57 | special unswappable uksm zero page. 58 | 59 | # Credits 60 | 61 | Ultra KSM. 
Copyright (C) 2011-2016 Nai Xia 62 | 63 | 64 | # Reference 65 | 66 | [FAST '18] UKSM: Swift Memory Deduplication via Hierarchical and Adaptive Memory Region Distilling [[PDF](https://www.usenix.org/system/files/conference/fast18/fast18-xia.pdf)] [[Slides](https://www.usenix.org/sites/default/files/conference/protected-files/fast18_slides_xia.pdf)] 67 | -------------------------------------------------------------------------------- /v4.x/uksm-4.7.patch: -------------------------------------------------------------------------------- 1 | diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX 2 | index 6a5e2a1..09eaa9a1 100644 3 | --- a/Documentation/vm/00-INDEX 4 | +++ b/Documentation/vm/00-INDEX 5 | @@ -18,6 +18,8 @@ idle_page_tracking.txt 6 | - description of the idle page tracking feature. 7 | ksm.txt 8 | - how to use the Kernel Samepage Merging feature. 9 | +uksm.txt 10 | + - Introduction to Ultra KSM 11 | numa 12 | - information about NUMA specific code in the Linux vm. 13 | numa_memory_policy.txt 14 | diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt 15 | new file mode 100644 16 | index 0000000..b7a110f 17 | --- /dev/null 18 | +++ b/Documentation/vm/uksm.txt 19 | @@ -0,0 +1,61 @@ 20 | +The Ultra Kernel Samepage Merging feature 21 | +---------------------------------------------- 22 | +/* 23 | + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia 24 | + * 25 | + * This is an improvement upon KSM. Some basic data structures and routines 26 | + * are borrowed from ksm.c . 27 | + * 28 | + * Its new features: 29 | + * 1. Full system scan: 30 | + * It automatically scans all user processes' anonymous VMAs. Kernel-user 31 | + * interaction to submit a memory area to KSM is no longer needed. 32 | + * 33 | + * 2. Rich area detection: 34 | + * It automatically detects rich areas containing abundant duplicated 35 | + * pages based. Rich areas are given a full scan speed. Poor areas are 36 | + * sampled at a reasonable speed with very low CPU consumption. 37 | + * 38 | + * 3. Ultra Per-page scan speed improvement: 39 | + * A new hash algorithm is proposed. As a result, on a machine with 40 | + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it 41 | + * can scan memory areas that does not contain duplicated pages at speed of 42 | + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of 43 | + * 477MB/sec ~ 923MB/sec. 44 | + * 45 | + * 4. Thrashing area avoidance: 46 | + * Thrashing area(an VMA that has frequent Ksm page break-out) can be 47 | + * filtered out. My benchmark shows it's more efficient than KSM's per-page 48 | + * hash value based volatile page detection. 49 | + * 50 | + * 51 | + * 5. Misc changes upon KSM: 52 | + * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page 53 | + * comparison. It's much faster than default C version on x86. 54 | + * * rmap_item now has an struct *page member to loosely cache a 55 | + * address-->page mapping, which reduces too much time-costly 56 | + * follow_page(). 57 | + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. 58 | + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ 59 | + * ksm is needed for this case. 60 | + * 61 | + * 6. Full Zero Page consideration(contributed by Figo Zhang) 62 | + * Now uksmd consider full zero pages as special pages and merge them to an 63 | + * special unswappable uksm zero page. 
64 | + */ 65 | + 66 | +ChangeLog: 67 | + 68 | +2012-05-05 The creation of this Doc 69 | +2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. 70 | +2012-05-28 UKSM 0.1.1.2 bug fix release 71 | +2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 72 | +2012-07-2 UKSM 0.1.2-beta2 73 | +2012-07-10 UKSM 0.1.2-beta3 74 | +2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. 75 | +2012-10-13 UKSM 0.1.2.1 Bug fixes. 76 | +2012-12-31 UKSM 0.1.2.2 Minor bug fixes. 77 | +2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug". 78 | +2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings. 79 | +2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation. 80 | +2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration. 81 | diff --git a/fs/exec.c b/fs/exec.c 82 | index 887c1c9..2bee16e 100644 83 | --- a/fs/exec.c 84 | +++ b/fs/exec.c 85 | @@ -19,7 +19,7 @@ 86 | * current->executable is only used by the procfs. This allows a dispatch 87 | * table to check for several different types of binary formats. We keep 88 | * trying until we recognize the file or we run out of supported binary 89 | - * formats. 90 | + * formats. 91 | */ 92 | 93 | #include 94 | @@ -57,6 +57,7 @@ 95 | #include 96 | #include 97 | #include 98 | +#include 99 | 100 | #include 101 | #include 102 | diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c 103 | index 8372046..82aa2f4 100644 104 | --- a/fs/proc/meminfo.c 105 | +++ b/fs/proc/meminfo.c 106 | @@ -89,6 +89,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) 107 | "SUnreclaim: %8lu kB\n" 108 | "KernelStack: %8lu kB\n" 109 | "PageTables: %8lu kB\n" 110 | +#ifdef CONFIG_UKSM 111 | + "KsmZeroPages: %8lu kB\n" 112 | +#endif 113 | #ifdef CONFIG_QUICKLIST 114 | "Quicklists: %8lu kB\n" 115 | #endif 116 | @@ -147,6 +150,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) 117 | K(global_page_state(NR_SLAB_UNRECLAIMABLE)), 118 | global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, 119 | K(global_page_state(NR_PAGETABLE)), 120 | +#ifdef CONFIG_UKSM 121 | + K(global_page_state(NR_UKSM_ZERO_PAGES)), 122 | +#endif 123 | #ifdef CONFIG_QUICKLIST 124 | K(quicklist_total_size()), 125 | #endif 126 | diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h 127 | index d4458b6..172ceb9 100644 128 | --- a/include/asm-generic/pgtable.h 129 | +++ b/include/asm-generic/pgtable.h 130 | @@ -601,12 +601,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, 131 | extern void untrack_pfn_moved(struct vm_area_struct *vma); 132 | #endif 133 | 134 | +#ifdef CONFIG_UKSM 135 | +static inline int is_uksm_zero_pfn(unsigned long pfn) 136 | +{ 137 | + extern unsigned long uksm_zero_pfn; 138 | + return pfn == uksm_zero_pfn; 139 | +} 140 | +#else 141 | +static inline int is_uksm_zero_pfn(unsigned long pfn) 142 | +{ 143 | + return 0; 144 | +} 145 | +#endif 146 | + 147 | #ifdef __HAVE_COLOR_ZERO_PAGE 148 | static inline int is_zero_pfn(unsigned long pfn) 149 | { 150 | extern unsigned long zero_pfn; 151 | unsigned long offset_from_zero_pfn = pfn - zero_pfn; 152 | - return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); 153 | + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); 154 | } 155 | 156 | #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) 157 | @@ -615,7 +628,7 @@ static inline int is_zero_pfn(unsigned long pfn) 158 | static inline int is_zero_pfn(unsigned long pfn) 159 | { 
160 | extern unsigned long zero_pfn; 161 | - return pfn == zero_pfn; 162 | + return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); 163 | } 164 | 165 | static inline unsigned long my_zero_pfn(unsigned long addr) 166 | diff --git a/include/linux/ksm.h b/include/linux/ksm.h 167 | index 7ae216a..06861d8 100644 168 | --- a/include/linux/ksm.h 169 | +++ b/include/linux/ksm.h 170 | @@ -19,21 +19,6 @@ struct mem_cgroup; 171 | #ifdef CONFIG_KSM 172 | int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 173 | unsigned long end, int advice, unsigned long *vm_flags); 174 | -int __ksm_enter(struct mm_struct *mm); 175 | -void __ksm_exit(struct mm_struct *mm); 176 | - 177 | -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) 178 | -{ 179 | - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) 180 | - return __ksm_enter(mm); 181 | - return 0; 182 | -} 183 | - 184 | -static inline void ksm_exit(struct mm_struct *mm) 185 | -{ 186 | - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) 187 | - __ksm_exit(mm); 188 | -} 189 | 190 | static inline struct stable_node *page_stable_node(struct page *page) 191 | { 192 | @@ -64,6 +49,33 @@ struct page *ksm_might_need_to_copy(struct page *page, 193 | int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); 194 | void ksm_migrate_page(struct page *newpage, struct page *oldpage); 195 | 196 | +#ifdef CONFIG_KSM_LEGACY 197 | +int __ksm_enter(struct mm_struct *mm); 198 | +void __ksm_exit(struct mm_struct *mm); 199 | +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) 200 | +{ 201 | + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) 202 | + return __ksm_enter(mm); 203 | + return 0; 204 | +} 205 | + 206 | +static inline void ksm_exit(struct mm_struct *mm) 207 | +{ 208 | + if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) 209 | + __ksm_exit(mm); 210 | +} 211 | + 212 | +#elif defined(CONFIG_UKSM) 213 | +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) 214 | +{ 215 | + return 0; 216 | +} 217 | + 218 | +static inline void ksm_exit(struct mm_struct *mm) 219 | +{ 220 | +} 221 | +#endif /* !CONFIG_UKSM */ 222 | + 223 | #else /* !CONFIG_KSM */ 224 | 225 | static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) 226 | @@ -106,4 +118,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) 227 | #endif /* CONFIG_MMU */ 228 | #endif /* !CONFIG_KSM */ 229 | 230 | +#include 231 | + 232 | #endif /* __LINUX_KSM_H */ 233 | diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h 234 | index ca3e517..ae62e7d1 100644 235 | --- a/include/linux/mm_types.h 236 | +++ b/include/linux/mm_types.h 237 | @@ -357,6 +357,9 @@ struct vm_area_struct { 238 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ 239 | #endif 240 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx; 241 | +#ifdef CONFIG_UKSM 242 | + struct vma_slot *uksm_vma_slot; 243 | +#endif 244 | }; 245 | 246 | struct core_thread { 247 | diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h 248 | index 02069c2..f7cce50 100644 249 | --- a/include/linux/mmzone.h 250 | +++ b/include/linux/mmzone.h 251 | @@ -153,6 +153,9 @@ enum zone_stat_item { 252 | WORKINGSET_NODERECLAIM, 253 | NR_ANON_TRANSPARENT_HUGEPAGES, 254 | NR_FREE_CMA_PAGES, 255 | +#ifdef CONFIG_UKSM 256 | + NR_UKSM_ZERO_PAGES, 257 | +#endif 258 | NR_VM_ZONE_STAT_ITEMS }; 259 | 260 | /* 261 | @@ -817,7 +820,7 @@ static inline int is_highmem_idx(enum zone_type idx) 262 | } 263 | 264 | /** 265 | - * is_highmem - helper function to quickly check if a 
struct zone is a 266 | + * is_highmem - helper function to quickly check if a struct zone is a 267 | * highmem zone or not. This is an attempt to keep references 268 | * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. 269 | * @zone - pointer to struct zone variable 270 | diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h 271 | new file mode 100644 272 | index 0000000..6780fdb 273 | --- /dev/null 274 | +++ b/include/linux/sradix-tree.h 275 | @@ -0,0 +1,77 @@ 276 | +#ifndef _LINUX_SRADIX_TREE_H 277 | +#define _LINUX_SRADIX_TREE_H 278 | + 279 | + 280 | +#define INIT_SRADIX_TREE(root, mask) \ 281 | +do { \ 282 | + (root)->height = 0; \ 283 | + (root)->gfp_mask = (mask); \ 284 | + (root)->rnode = NULL; \ 285 | +} while (0) 286 | + 287 | +#define ULONG_BITS (sizeof(unsigned long) * 8) 288 | +#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) 289 | +//#define SRADIX_TREE_MAP_SHIFT 6 290 | +//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) 291 | +//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) 292 | + 293 | +struct sradix_tree_node { 294 | + unsigned int height; /* Height from the bottom */ 295 | + unsigned int count; 296 | + unsigned int fulls; /* Number of full sublevel trees */ 297 | + struct sradix_tree_node *parent; 298 | + void *stores[0]; 299 | +}; 300 | + 301 | +/* A simple radix tree implementation */ 302 | +struct sradix_tree_root { 303 | + unsigned int height; 304 | + struct sradix_tree_node *rnode; 305 | + 306 | + /* Where found to have available empty stores in its sublevels */ 307 | + struct sradix_tree_node *enter_node; 308 | + unsigned int shift; 309 | + unsigned int stores_size; 310 | + unsigned int mask; 311 | + unsigned long min; /* The first hole index */ 312 | + unsigned long num; 313 | + //unsigned long *height_to_maxindex; 314 | + 315 | + /* How the node is allocated and freed. 
*/ 316 | + struct sradix_tree_node *(*alloc)(void); 317 | + void (*free)(struct sradix_tree_node *node); 318 | + 319 | + /* When a new node is added and removed */ 320 | + void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); 321 | + void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); 322 | + void (*rm)(struct sradix_tree_node *node, unsigned offset); 323 | +}; 324 | + 325 | +struct sradix_tree_path { 326 | + struct sradix_tree_node *node; 327 | + int offset; 328 | +}; 329 | + 330 | +static inline 331 | +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) 332 | +{ 333 | + root->height = 0; 334 | + root->rnode = NULL; 335 | + root->shift = shift; 336 | + root->stores_size = 1UL << shift; 337 | + root->mask = root->stores_size - 1; 338 | +} 339 | + 340 | + 341 | +extern void *sradix_tree_next(struct sradix_tree_root *root, 342 | + struct sradix_tree_node *node, unsigned long index, 343 | + int (*iter)(void *, unsigned long)); 344 | + 345 | +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); 346 | + 347 | +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, 348 | + struct sradix_tree_node *node, unsigned long index); 349 | + 350 | +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); 351 | + 352 | +#endif /* _LINUX_SRADIX_TREE_H */ 353 | diff --git a/include/linux/uksm.h b/include/linux/uksm.h 354 | new file mode 100644 355 | index 0000000..825f05e 356 | --- /dev/null 357 | +++ b/include/linux/uksm.h 358 | @@ -0,0 +1,149 @@ 359 | +#ifndef __LINUX_UKSM_H 360 | +#define __LINUX_UKSM_H 361 | +/* 362 | + * Memory merging support. 363 | + * 364 | + * This code enables dynamic sharing of identical pages found in different 365 | + * memory areas, even if they are not shared by fork(). 366 | + */ 367 | + 368 | +/* if !CONFIG_UKSM this file should not be compiled at all. 
*/ 369 | +#ifdef CONFIG_UKSM 370 | + 371 | +#include 372 | +#include 373 | +#include 374 | +#include 375 | +#include 376 | + 377 | +extern unsigned long zero_pfn __read_mostly; 378 | +extern unsigned long uksm_zero_pfn __read_mostly; 379 | +extern struct page *empty_uksm_zero_page; 380 | + 381 | +/* must be done before linked to mm */ 382 | +extern void uksm_vma_add_new(struct vm_area_struct *vma); 383 | +extern void uksm_remove_vma(struct vm_area_struct *vma); 384 | + 385 | +#define UKSM_SLOT_NEED_SORT (1 << 0) 386 | +#define UKSM_SLOT_NEED_RERAND (1 << 1) 387 | +#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ 388 | +#define UKSM_SLOT_FUL_SCANNED (1 << 3) 389 | +#define UKSM_SLOT_IN_UKSM (1 << 4) 390 | + 391 | +struct vma_slot { 392 | + struct sradix_tree_node *snode; 393 | + unsigned long sindex; 394 | + 395 | + struct list_head slot_list; 396 | + unsigned long fully_scanned_round; 397 | + unsigned long dedup_num; 398 | + unsigned long pages_scanned; 399 | + unsigned long this_sampled; 400 | + unsigned long last_scanned; 401 | + unsigned long pages_to_scan; 402 | + struct scan_rung *rung; 403 | + struct page **rmap_list_pool; 404 | + unsigned int *pool_counts; 405 | + unsigned long pool_size; 406 | + struct vm_area_struct *vma; 407 | + struct mm_struct *mm; 408 | + unsigned long ctime_j; 409 | + unsigned long pages; 410 | + unsigned long flags; 411 | + unsigned long pages_cowed; /* pages cowed this round */ 412 | + unsigned long pages_merged; /* pages merged this round */ 413 | + unsigned long pages_bemerged; 414 | + 415 | + /* when it has page merged in this eval round */ 416 | + struct list_head dedup_list; 417 | +}; 418 | + 419 | +static inline void uksm_unmap_zero_page(pte_t pte) 420 | +{ 421 | + if (pte_pfn(pte) == uksm_zero_pfn) 422 | + __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); 423 | +} 424 | + 425 | +static inline void uksm_map_zero_page(pte_t pte) 426 | +{ 427 | + if (pte_pfn(pte) == uksm_zero_pfn) 428 | + __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); 429 | +} 430 | + 431 | +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) 432 | +{ 433 | + if (vma->uksm_vma_slot && PageKsm(page)) 434 | + vma->uksm_vma_slot->pages_cowed++; 435 | +} 436 | + 437 | +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) 438 | +{ 439 | + if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) 440 | + vma->uksm_vma_slot->pages_cowed++; 441 | +} 442 | + 443 | +static inline int uksm_flags_can_scan(unsigned long vm_flags) 444 | +{ 445 | +#ifdef VM_SAO 446 | + if (vm_flags & VM_SAO) 447 | + return 0; 448 | +#endif 449 | + 450 | + return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | 451 | + VM_HUGETLB | VM_MIXEDMAP | VM_SHARED 452 | + | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN)); 453 | +} 454 | + 455 | +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) 456 | +{ 457 | + if (uksm_flags_can_scan(*vm_flags_p)) 458 | + *vm_flags_p |= VM_MERGEABLE; 459 | +} 460 | + 461 | +/* 462 | + * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will 463 | + * be removed when uksm zero page patch is stable enough. 
464 | + */ 465 | +static inline void uksm_bugon_zeropage(pte_t pte) 466 | +{ 467 | + BUG_ON(pte_pfn(pte) == uksm_zero_pfn); 468 | +} 469 | +#else 470 | +static inline void uksm_vma_add_new(struct vm_area_struct *vma) 471 | +{ 472 | +} 473 | + 474 | +static inline void uksm_remove_vma(struct vm_area_struct *vma) 475 | +{ 476 | +} 477 | + 478 | +static inline void uksm_unmap_zero_page(pte_t pte) 479 | +{ 480 | +} 481 | + 482 | +static inline void uksm_map_zero_page(pte_t pte) 483 | +{ 484 | +} 485 | + 486 | +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) 487 | +{ 488 | +} 489 | + 490 | +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) 491 | +{ 492 | +} 493 | + 494 | +static inline int uksm_flags_can_scan(unsigned long vm_flags) 495 | +{ 496 | + return 0; 497 | +} 498 | + 499 | +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) 500 | +{ 501 | +} 502 | + 503 | +static inline void uksm_bugon_zeropage(pte_t pte) 504 | +{ 505 | +} 506 | +#endif /* !CONFIG_UKSM */ 507 | +#endif /* __LINUX_UKSM_H */ 508 | diff --git a/kernel/fork.c b/kernel/fork.c 509 | index d6404ed..4ce26c0 100644 510 | --- a/kernel/fork.c 511 | +++ b/kernel/fork.c 512 | @@ -459,7 +459,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 513 | goto fail_nomem; 514 | charge = len; 515 | } 516 | - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 517 | + tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 518 | if (!tmp) 519 | goto fail_nomem; 520 | *tmp = *mpnt; 521 | @@ -512,7 +512,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 522 | __vma_link_rb(mm, tmp, rb_link, rb_parent); 523 | rb_link = &tmp->vm_rb.rb_right; 524 | rb_parent = &tmp->vm_rb; 525 | - 526 | + uksm_vma_add_new(tmp); 527 | mm->map_count++; 528 | retval = copy_page_range(mm, oldmm, mpnt); 529 | 530 | diff --git a/lib/Makefile b/lib/Makefile 531 | index ff6a7a6..ac0bb55 100644 532 | --- a/lib/Makefile 533 | +++ b/lib/Makefile 534 | @@ -20,7 +20,7 @@ KCOV_INSTRUMENT_dynamic_debug.o := n 535 | KCOV_INSTRUMENT_hweight.o := n 536 | 537 | lib-y := ctype.o string.o vsprintf.o cmdline.o \ 538 | - rbtree.o radix-tree.o dump_stack.o timerqueue.o\ 539 | + rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ 540 | idr.o int_sqrt.o extable.o \ 541 | sha1.o md5.o irq_regs.o argv_split.o \ 542 | flex_proportions.o ratelimit.o show_mem.o \ 543 | diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c 544 | new file mode 100644 545 | index 0000000..8d06329 546 | --- /dev/null 547 | +++ b/lib/sradix-tree.c 548 | @@ -0,0 +1,476 @@ 549 | +#include 550 | +#include 551 | +#include 552 | +#include 553 | +#include 554 | +#include 555 | +#include 556 | + 557 | +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) 558 | +{ 559 | + return node->fulls == root->stores_size || 560 | + (node->height == 1 && node->count == root->stores_size); 561 | +} 562 | + 563 | +/* 564 | + * Extend a sradix tree so it can store key @index. 565 | + */ 566 | +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) 567 | +{ 568 | + struct sradix_tree_node *node; 569 | + unsigned int height; 570 | + 571 | + if (unlikely(root->rnode == NULL)) { 572 | + if (!(node = root->alloc())) 573 | + return -ENOMEM; 574 | + 575 | + node->height = 1; 576 | + root->rnode = node; 577 | + root->height = 1; 578 | + } 579 | + 580 | + /* Figure out what the height should be. 
*/ 581 | + height = root->height; 582 | + index >>= root->shift * height; 583 | + 584 | + while (index) { 585 | + index >>= root->shift; 586 | + height++; 587 | + } 588 | + 589 | + while (height > root->height) { 590 | + unsigned int newheight; 591 | + if (!(node = root->alloc())) 592 | + return -ENOMEM; 593 | + 594 | + /* Increase the height. */ 595 | + node->stores[0] = root->rnode; 596 | + root->rnode->parent = node; 597 | + if (root->extend) 598 | + root->extend(node, root->rnode); 599 | + 600 | + newheight = root->height + 1; 601 | + node->height = newheight; 602 | + node->count = 1; 603 | + if (sradix_node_full(root, root->rnode)) 604 | + node->fulls = 1; 605 | + 606 | + root->rnode = node; 607 | + root->height = newheight; 608 | + } 609 | + 610 | + return 0; 611 | +} 612 | + 613 | +/* 614 | + * Search the next item from the current node, that is not NULL 615 | + * and can satify root->iter(). 616 | + */ 617 | +void *sradix_tree_next(struct sradix_tree_root *root, 618 | + struct sradix_tree_node *node, unsigned long index, 619 | + int (*iter)(void *item, unsigned long height)) 620 | +{ 621 | + unsigned long offset; 622 | + void *item; 623 | + 624 | + if (unlikely(node == NULL)) { 625 | + node = root->rnode; 626 | + for (offset = 0; offset < root->stores_size; offset++) { 627 | + item = node->stores[offset]; 628 | + if (item && (!iter || iter(item, node->height))) 629 | + break; 630 | + } 631 | + 632 | + if (unlikely(offset >= root->stores_size)) 633 | + return NULL; 634 | + 635 | + if (node->height == 1) 636 | + return item; 637 | + else 638 | + goto go_down; 639 | + } 640 | + 641 | + while (node) { 642 | + offset = (index & root->mask) + 1; 643 | + for (;offset < root->stores_size; offset++) { 644 | + item = node->stores[offset]; 645 | + if (item && (!iter || iter(item, node->height))) 646 | + break; 647 | + } 648 | + 649 | + if (offset < root->stores_size) 650 | + break; 651 | + 652 | + node = node->parent; 653 | + index >>= root->shift; 654 | + } 655 | + 656 | + if (!node) 657 | + return NULL; 658 | + 659 | + while (node->height > 1) { 660 | +go_down: 661 | + node = item; 662 | + for (offset = 0; offset < root->stores_size; offset++) { 663 | + item = node->stores[offset]; 664 | + if (item && (!iter || iter(item, node->height))) 665 | + break; 666 | + } 667 | + 668 | + if (unlikely(offset >= root->stores_size)) 669 | + return NULL; 670 | + } 671 | + 672 | + BUG_ON(offset > root->stores_size); 673 | + 674 | + return item; 675 | +} 676 | + 677 | +/* 678 | + * Blindly insert the item to the tree. Typically, we reuse the 679 | + * first empty store item. 
680 | + */ 681 | +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) 682 | +{ 683 | + unsigned long index; 684 | + unsigned int height; 685 | + struct sradix_tree_node *node, *tmp = NULL; 686 | + int offset, offset_saved; 687 | + void **store = NULL; 688 | + int error, i, j, shift; 689 | + 690 | +go_on: 691 | + index = root->min; 692 | + 693 | + if (root->enter_node && !sradix_node_full(root, root->enter_node)) { 694 | + node = root->enter_node; 695 | + BUG_ON((index >> (root->shift * root->height))); 696 | + } else { 697 | + node = root->rnode; 698 | + if (node == NULL || (index >> (root->shift * root->height)) 699 | + || sradix_node_full(root, node)) { 700 | + error = sradix_tree_extend(root, index); 701 | + if (error) 702 | + return error; 703 | + 704 | + node = root->rnode; 705 | + } 706 | + } 707 | + 708 | + 709 | + height = node->height; 710 | + shift = (height - 1) * root->shift; 711 | + offset = (index >> shift) & root->mask; 712 | + while (shift > 0) { 713 | + offset_saved = offset; 714 | + for (; offset < root->stores_size; offset++) { 715 | + store = &node->stores[offset]; 716 | + tmp = *store; 717 | + 718 | + if (!tmp || !sradix_node_full(root, tmp)) 719 | + break; 720 | + } 721 | + BUG_ON(offset >= root->stores_size); 722 | + 723 | + if (offset != offset_saved) { 724 | + index += (offset - offset_saved) << shift; 725 | + index &= ~((1UL << shift) - 1); 726 | + } 727 | + 728 | + if (!tmp) { 729 | + if (!(tmp = root->alloc())) 730 | + return -ENOMEM; 731 | + 732 | + tmp->height = shift / root->shift; 733 | + *store = tmp; 734 | + tmp->parent = node; 735 | + node->count++; 736 | +// if (root->extend) 737 | +// root->extend(node, tmp); 738 | + } 739 | + 740 | + node = tmp; 741 | + shift -= root->shift; 742 | + offset = (index >> shift) & root->mask; 743 | + } 744 | + 745 | + BUG_ON(node->height != 1); 746 | + 747 | + 748 | + store = &node->stores[offset]; 749 | + for (i = 0, j = 0; 750 | + j < root->stores_size - node->count && 751 | + i < root->stores_size - offset && j < num; i++) { 752 | + if (!store[i]) { 753 | + store[i] = item[j]; 754 | + if (root->assign) 755 | + root->assign(node, index + i, item[j]); 756 | + j++; 757 | + } 758 | + } 759 | + 760 | + node->count += j; 761 | + root->num += j; 762 | + num -= j; 763 | + 764 | + while (sradix_node_full(root, node)) { 765 | + node = node->parent; 766 | + if (!node) 767 | + break; 768 | + 769 | + node->fulls++; 770 | + } 771 | + 772 | + if (unlikely(!node)) { 773 | + /* All nodes are full */ 774 | + root->min = 1 << (root->height * root->shift); 775 | + root->enter_node = NULL; 776 | + } else { 777 | + root->min = index + i - 1; 778 | + root->min |= (1UL << (node->height - 1)) - 1; 779 | + root->min++; 780 | + root->enter_node = node; 781 | + } 782 | + 783 | + if (num) { 784 | + item += j; 785 | + goto go_on; 786 | + } 787 | + 788 | + return 0; 789 | +} 790 | + 791 | + 792 | +/** 793 | + * sradix_tree_shrink - shrink height of a sradix tree to minimal 794 | + * @root sradix tree root 795 | + * 796 | + */ 797 | +static inline void sradix_tree_shrink(struct sradix_tree_root *root) 798 | +{ 799 | + /* try to shrink tree height */ 800 | + while (root->height > 1) { 801 | + struct sradix_tree_node *to_free = root->rnode; 802 | + 803 | + /* 804 | + * The candidate node has more than one child, or its child 805 | + * is not at the leftmost store, we cannot shrink. 
806 | + */ 807 | + if (to_free->count != 1 || !to_free->stores[0]) 808 | + break; 809 | + 810 | + root->rnode = to_free->stores[0]; 811 | + root->rnode->parent = NULL; 812 | + root->height--; 813 | + if (unlikely(root->enter_node == to_free)) { 814 | + root->enter_node = NULL; 815 | + } 816 | + root->free(to_free); 817 | + } 818 | +} 819 | + 820 | +/* 821 | + * Del the item on the known leaf node and index 822 | + */ 823 | +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, 824 | + struct sradix_tree_node *node, unsigned long index) 825 | +{ 826 | + unsigned int offset; 827 | + struct sradix_tree_node *start, *end; 828 | + 829 | + BUG_ON(node->height != 1); 830 | + 831 | + start = node; 832 | + while (node && !(--node->count)) 833 | + node = node->parent; 834 | + 835 | + end = node; 836 | + if (!node) { 837 | + root->rnode = NULL; 838 | + root->height = 0; 839 | + root->min = 0; 840 | + root->num = 0; 841 | + root->enter_node = NULL; 842 | + } else { 843 | + offset = (index >> (root->shift * (node->height - 1))) & root->mask; 844 | + if (root->rm) 845 | + root->rm(node, offset); 846 | + node->stores[offset] = NULL; 847 | + root->num--; 848 | + if (root->min > index) { 849 | + root->min = index; 850 | + root->enter_node = node; 851 | + } 852 | + } 853 | + 854 | + if (start != end) { 855 | + do { 856 | + node = start; 857 | + start = start->parent; 858 | + if (unlikely(root->enter_node == node)) 859 | + root->enter_node = end; 860 | + root->free(node); 861 | + } while (start != end); 862 | + 863 | + /* 864 | + * Note that shrink may free "end", so enter_node still need to 865 | + * be checked inside. 866 | + */ 867 | + sradix_tree_shrink(root); 868 | + } else if (node->count == root->stores_size - 1) { 869 | + /* It WAS a full leaf node. Update the ancestors */ 870 | + node = node->parent; 871 | + while (node) { 872 | + node->fulls--; 873 | + if (node->fulls != root->stores_size - 1) 874 | + break; 875 | + 876 | + node = node->parent; 877 | + } 878 | + } 879 | +} 880 | + 881 | +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) 882 | +{ 883 | + unsigned int height, offset; 884 | + struct sradix_tree_node *node; 885 | + int shift; 886 | + 887 | + node = root->rnode; 888 | + if (node == NULL || (index >> (root->shift * root->height))) 889 | + return NULL; 890 | + 891 | + height = root->height; 892 | + shift = (height - 1) * root->shift; 893 | + 894 | + do { 895 | + offset = (index >> shift) & root->mask; 896 | + node = node->stores[offset]; 897 | + if (!node) 898 | + return NULL; 899 | + 900 | + shift -= root->shift; 901 | + } while (shift >= 0); 902 | + 903 | + return node; 904 | +} 905 | + 906 | +/* 907 | + * Return the item if it exists, otherwise create it in place 908 | + * and return the created item. 
909 | + */ 910 | +void *sradix_tree_lookup_create(struct sradix_tree_root *root, 911 | + unsigned long index, void *(*item_alloc)(void)) 912 | +{ 913 | + unsigned int height, offset; 914 | + struct sradix_tree_node *node, *tmp; 915 | + void *item; 916 | + int shift, error; 917 | + 918 | + if (root->rnode == NULL || (index >> (root->shift * root->height))) { 919 | + if (item_alloc) { 920 | + error = sradix_tree_extend(root, index); 921 | + if (error) 922 | + return NULL; 923 | + } else { 924 | + return NULL; 925 | + } 926 | + } 927 | + 928 | + node = root->rnode; 929 | + height = root->height; 930 | + shift = (height - 1) * root->shift; 931 | + 932 | + do { 933 | + offset = (index >> shift) & root->mask; 934 | + if (!node->stores[offset]) { 935 | + if (!(tmp = root->alloc())) 936 | + return NULL; 937 | + 938 | + tmp->height = shift / root->shift; 939 | + node->stores[offset] = tmp; 940 | + tmp->parent = node; 941 | + node->count++; 942 | + node = tmp; 943 | + } else { 944 | + node = node->stores[offset]; 945 | + } 946 | + 947 | + shift -= root->shift; 948 | + } while (shift > 0); 949 | + 950 | + BUG_ON(node->height != 1); 951 | + offset = index & root->mask; 952 | + if (node->stores[offset]) { 953 | + return node->stores[offset]; 954 | + } else if (item_alloc) { 955 | + if (!(item = item_alloc())) 956 | + return NULL; 957 | + 958 | + node->stores[offset] = item; 959 | + 960 | + /* 961 | + * NOTE: we do NOT call root->assign here, since this item is 962 | + * newly created by us having no meaning. Caller can call this 963 | + * if it's necessary to do so. 964 | + */ 965 | + 966 | + node->count++; 967 | + root->num++; 968 | + 969 | + while (sradix_node_full(root, node)) { 970 | + node = node->parent; 971 | + if (!node) 972 | + break; 973 | + 974 | + node->fulls++; 975 | + } 976 | + 977 | + if (unlikely(!node)) { 978 | + /* All nodes are full */ 979 | + root->min = 1 << (root->height * root->shift); 980 | + } else { 981 | + if (root->min == index) { 982 | + root->min |= (1UL << (node->height - 1)) - 1; 983 | + root->min++; 984 | + root->enter_node = node; 985 | + } 986 | + } 987 | + 988 | + return item; 989 | + } else { 990 | + return NULL; 991 | + } 992 | + 993 | +} 994 | + 995 | +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) 996 | +{ 997 | + unsigned int height, offset; 998 | + struct sradix_tree_node *node; 999 | + int shift; 1000 | + 1001 | + node = root->rnode; 1002 | + if (node == NULL || (index >> (root->shift * root->height))) 1003 | + return -ENOENT; 1004 | + 1005 | + height = root->height; 1006 | + shift = (height - 1) * root->shift; 1007 | + 1008 | + do { 1009 | + offset = (index >> shift) & root->mask; 1010 | + node = node->stores[offset]; 1011 | + if (!node) 1012 | + return -ENOENT; 1013 | + 1014 | + shift -= root->shift; 1015 | + } while (shift > 0); 1016 | + 1017 | + offset = index & root->mask; 1018 | + if (!node->stores[offset]) 1019 | + return -ENOENT; 1020 | + 1021 | + sradix_tree_delete_from_leaf(root, node, index); 1022 | + 1023 | + return 0; 1024 | +} 1025 | diff --git a/mm/Kconfig b/mm/Kconfig 1026 | index 3e2daef..165b60e 100644 1027 | --- a/mm/Kconfig 1028 | +++ b/mm/Kconfig 1029 | @@ -332,6 +332,32 @@ config KSM 1030 | See Documentation/vm/ksm.txt for more information: KSM is inactive 1031 | until a program has madvised that an area is MADV_MERGEABLE, and 1032 | root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). 
1033 | +choice 1034 | + prompt "Choose UKSM/KSM strategy" 1035 | + default UKSM 1036 | + depends on KSM 1037 | + help 1038 | + This option allows to select a UKSM/KSM stragety. 1039 | + 1040 | +config UKSM 1041 | + bool "Ultra-KSM for page merging" 1042 | + depends on KSM 1043 | + help 1044 | + UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same 1045 | + page Merging), but with a fundamentally rewritten core algorithm. With 1046 | + an advanced algorithm, UKSM now can transparently scans all anonymously 1047 | + mapped user space applications with an significantly improved scan speed 1048 | + and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from 1049 | + UKSM. Now UKSM has its first stable release and first real world enterprise user. 1050 | + For more information, please goto its project page. 1051 | + (github.com/dolohow/uksm) 1052 | + 1053 | +config KSM_LEGACY 1054 | + bool "Legacy KSM implementation" 1055 | + depends on KSM 1056 | + help 1057 | + The legacy KSM implementation from Redhat. 1058 | +endchoice 1059 | 1060 | config DEFAULT_MMAP_MIN_ADDR 1061 | int "Low address space to protect from user allocation" 1062 | diff --git a/mm/Makefile b/mm/Makefile 1063 | index 78c6f7d..7e7cd8a 100644 1064 | --- a/mm/Makefile 1065 | +++ b/mm/Makefile 1066 | @@ -63,7 +63,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o 1067 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 1068 | obj-$(CONFIG_SLOB) += slob.o 1069 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 1070 | -obj-$(CONFIG_KSM) += ksm.o 1071 | +obj-$(CONFIG_KSM_LEGACY) += ksm.o 1072 | +obj-$(CONFIG_UKSM) += uksm.o 1073 | obj-$(CONFIG_PAGE_POISONING) += page_poison.o 1074 | obj-$(CONFIG_SLAB) += slab.o 1075 | obj-$(CONFIG_SLUB) += slub.o 1076 | diff --git a/mm/memory.c b/mm/memory.c 1077 | index 9e04681..02200d3 100644 1078 | --- a/mm/memory.c 1079 | +++ b/mm/memory.c 1080 | @@ -124,6 +124,28 @@ unsigned long highest_memmap_pfn __read_mostly; 1081 | 1082 | EXPORT_SYMBOL(zero_pfn); 1083 | 1084 | +#ifdef CONFIG_UKSM 1085 | +unsigned long uksm_zero_pfn __read_mostly; 1086 | +EXPORT_SYMBOL_GPL(uksm_zero_pfn); 1087 | +struct page *empty_uksm_zero_page; 1088 | + 1089 | +static int __init setup_uksm_zero_page(void) 1090 | +{ 1091 | + unsigned long addr; 1092 | + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0); 1093 | + if (!addr) 1094 | + panic("Oh boy, that early out of memory?"); 1095 | + 1096 | + empty_uksm_zero_page = virt_to_page((void *) addr); 1097 | + SetPageReserved(empty_uksm_zero_page); 1098 | + 1099 | + uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); 1100 | + 1101 | + return 0; 1102 | +} 1103 | +core_initcall(setup_uksm_zero_page); 1104 | +#endif 1105 | + 1106 | /* 1107 | * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() 1108 | */ 1109 | @@ -135,6 +157,7 @@ static int __init init_zero_pfn(void) 1110 | core_initcall(init_zero_pfn); 1111 | 1112 | 1113 | + 1114 | #if defined(SPLIT_RSS_COUNTING) 1115 | 1116 | void sync_mm_rss(struct mm_struct *mm) 1117 | @@ -905,6 +928,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1118 | get_page(page); 1119 | page_dup_rmap(page, false); 1120 | rss[mm_counter(page)]++; 1121 | + 1122 | + /* Should return NULL in vm_normal_page() */ 1123 | + uksm_bugon_zeropage(pte); 1124 | + } else { 1125 | + uksm_map_zero_page(pte); 1126 | } 1127 | 1128 | out_set_pte: 1129 | @@ -1138,8 +1166,10 @@ again: 1130 | ptent = ptep_get_and_clear_full(mm, addr, pte, 1131 | tlb->fullmm); 1132 | tlb_remove_tlb_entry(tlb, pte, addr); 1133 | - if 
(unlikely(!page)) 1134 | + if (unlikely(!page)) { 1135 | + uksm_unmap_zero_page(ptent); 1136 | continue; 1137 | + } 1138 | 1139 | if (!PageAnon(page)) { 1140 | if (pte_dirty(ptent)) { 1141 | @@ -1995,8 +2025,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo 1142 | clear_page(kaddr); 1143 | kunmap_atomic(kaddr); 1144 | flush_dcache_page(dst); 1145 | - } else 1146 | + } else { 1147 | copy_user_highpage(dst, src, va, vma); 1148 | + uksm_cow_page(vma, src); 1149 | + } 1150 | } 1151 | 1152 | static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) 1153 | @@ -2141,6 +2173,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, 1154 | new_page = alloc_zeroed_user_highpage_movable(vma, address); 1155 | if (!new_page) 1156 | goto oom; 1157 | + uksm_cow_pte(vma, orig_pte); 1158 | } else { 1159 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1160 | if (!new_page) 1161 | @@ -2166,7 +2199,9 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, 1162 | mm_counter_file(old_page)); 1163 | inc_mm_counter_fast(mm, MM_ANONPAGES); 1164 | } 1165 | + uksm_bugon_zeropage(orig_pte); 1166 | } else { 1167 | + uksm_unmap_zero_page(orig_pte); 1168 | inc_mm_counter_fast(mm, MM_ANONPAGES); 1169 | } 1170 | flush_cache_page(vma, address, pte_pfn(orig_pte)); 1171 | diff --git a/mm/mmap.c b/mm/mmap.c 1172 | index de2c176..ce60715 100644 1173 | --- a/mm/mmap.c 1174 | +++ b/mm/mmap.c 1175 | @@ -43,6 +43,7 @@ 1176 | #include 1177 | #include 1178 | #include 1179 | +#include 1180 | 1181 | #include 1182 | #include 1183 | @@ -164,6 +165,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) 1184 | if (vma->vm_file) 1185 | fput(vma->vm_file); 1186 | mpol_put(vma_policy(vma)); 1187 | + uksm_remove_vma(vma); 1188 | kmem_cache_free(vm_area_cachep, vma); 1189 | return next; 1190 | } 1191 | @@ -629,9 +631,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, 1192 | long adjust_next = 0; 1193 | int remove_next = 0; 1194 | 1195 | +/* 1196 | + * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is 1197 | + * acquired 1198 | + */ 1199 | + uksm_remove_vma(vma); 1200 | + 1201 | if (next && !insert) { 1202 | struct vm_area_struct *exporter = NULL; 1203 | 1204 | + uksm_remove_vma(next); 1205 | if (end >= next->vm_end) { 1206 | /* 1207 | * vma expands, overlapping all the next, and 1208 | @@ -725,6 +734,7 @@ again: remove_next = 1 + (end > next->vm_end); 1209 | end_changed = true; 1210 | } 1211 | vma->vm_pgoff = pgoff; 1212 | + 1213 | if (adjust_next) { 1214 | next->vm_start += adjust_next << PAGE_SHIFT; 1215 | next->vm_pgoff += adjust_next; 1216 | @@ -795,16 +805,22 @@ again: remove_next = 1 + (end > next->vm_end); 1217 | * up the code too much to do both in one go. 
1218 | */ 1219 | next = vma->vm_next; 1220 | - if (remove_next == 2) 1221 | + if (remove_next == 2) { 1222 | + uksm_remove_vma(next); 1223 | goto again; 1224 | - else if (next) 1225 | + } else if (next) { 1226 | vma_gap_update(next); 1227 | - else 1228 | + } else { 1229 | mm->highest_vm_end = end; 1230 | + } 1231 | + } else { 1232 | + if (next && !insert) 1233 | + uksm_vma_add_new(next); 1234 | } 1235 | if (insert && file) 1236 | uprobe_mmap(insert); 1237 | 1238 | + uksm_vma_add_new(vma); 1239 | validate_mm(mm); 1240 | 1241 | return 0; 1242 | @@ -1196,6 +1212,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, 1243 | vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | 1244 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1245 | 1246 | + /* If uksm is enabled, we add VM_MERGABLE to new VMAs. */ 1247 | + uksm_vm_flags_mod(&vm_flags); 1248 | + 1249 | if (flags & MAP_LOCKED) 1250 | if (!can_do_mlock()) 1251 | return -EPERM; 1252 | @@ -1534,6 +1553,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, 1253 | allow_write_access(file); 1254 | } 1255 | file = vma->vm_file; 1256 | + uksm_vma_add_new(vma); 1257 | out: 1258 | perf_event_mmap(vma); 1259 | 1260 | @@ -1575,6 +1595,7 @@ allow_write_and_free_vma: 1261 | if (vm_flags & VM_DENYWRITE) 1262 | allow_write_access(file); 1263 | free_vma: 1264 | + uksm_remove_vma(vma); 1265 | kmem_cache_free(vm_area_cachep, vma); 1266 | unacct_error: 1267 | if (charged) 1268 | @@ -2369,6 +2390,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 1269 | else 1270 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 1271 | 1272 | + uksm_vma_add_new(new); 1273 | + 1274 | /* Success. */ 1275 | if (!err) 1276 | return 0; 1277 | @@ -2639,6 +2662,7 @@ static int do_brk(unsigned long addr, unsigned long len) 1278 | return 0; 1279 | 1280 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 1281 | + uksm_vm_flags_mod(&flags); 1282 | 1283 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 1284 | if (offset_in_page(error)) 1285 | @@ -2696,6 +2720,7 @@ static int do_brk(unsigned long addr, unsigned long len) 1286 | vma->vm_flags = flags; 1287 | vma->vm_page_prot = vm_get_page_prot(flags); 1288 | vma_link(mm, vma, prev, rb_link, rb_parent); 1289 | + uksm_vma_add_new(vma); 1290 | out: 1291 | perf_event_mmap(vma); 1292 | mm->total_vm += len >> PAGE_SHIFT; 1293 | @@ -2734,6 +2759,12 @@ void exit_mmap(struct mm_struct *mm) 1294 | /* mm's last user has gone, and its about to be pulled down */ 1295 | mmu_notifier_release(mm); 1296 | 1297 | + /* 1298 | + * Taking write lock on mmap_sem does not harm others, 1299 | + * but it's crucial for uksm to avoid races. 
1300 | + */ 1301 | + down_write(&mm->mmap_sem); 1302 | + 1303 | if (mm->locked_vm) { 1304 | vma = mm->mmap; 1305 | while (vma) { 1306 | @@ -2769,6 +2800,11 @@ void exit_mmap(struct mm_struct *mm) 1307 | vma = remove_vma(vma); 1308 | } 1309 | vm_unacct_memory(nr_accounted); 1310 | + 1311 | + mm->mmap = NULL; 1312 | + mm->mm_rb = RB_ROOT; 1313 | + vmacache_invalidate(mm); 1314 | + up_write(&mm->mmap_sem); 1315 | } 1316 | 1317 | /* Insert vm structure into process list sorted by address 1318 | @@ -2878,6 +2914,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 1319 | new_vma->vm_ops->open(new_vma); 1320 | vma_link(mm, new_vma, prev, rb_link, rb_parent); 1321 | *need_rmap_locks = false; 1322 | + uksm_vma_add_new(new_vma); 1323 | } 1324 | return new_vma; 1325 | 1326 | @@ -3015,6 +3052,7 @@ static struct vm_area_struct *__install_special_mapping( 1327 | vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); 1328 | 1329 | perf_event_mmap(vma); 1330 | + uksm_vma_add_new(vma); 1331 | 1332 | return vma; 1333 | 1334 | diff --git a/mm/rmap.c b/mm/rmap.c 1335 | index 701b93f..64ba784 100644 1336 | --- a/mm/rmap.c 1337 | +++ b/mm/rmap.c 1338 | @@ -1110,9 +1110,9 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) 1339 | 1340 | /** 1341 | * __page_set_anon_rmap - set up new anonymous rmap 1342 | - * @page: Page to add to rmap 1343 | + * @page: Page to add to rmap 1344 | * @vma: VM area to add page to. 1345 | - * @address: User virtual address of the mapping 1346 | + * @address: User virtual address of the mapping 1347 | * @exclusive: the page is exclusively owned by the current process 1348 | */ 1349 | static void __page_set_anon_rmap(struct page *page, 1350 | diff --git a/mm/uksm.c b/mm/uksm.c 1351 | new file mode 100644 1352 | index 0000000..64b6dc6 1353 | --- /dev/null 1354 | +++ b/mm/uksm.c 1355 | @@ -0,0 +1,5545 @@ 1356 | +/* 1357 | + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia 1358 | + * 1359 | + * This is an improvement upon KSM. Some basic data structures and routines 1360 | + * are borrowed from ksm.c . 1361 | + * 1362 | + * Its new features: 1363 | + * 1. Full system scan: 1364 | + * It automatically scans all user processes' anonymous VMAs. Kernel-user 1365 | + * interaction to submit a memory area to KSM is no longer needed. 1366 | + * 1367 | + * 2. Rich area detection: 1368 | + * It automatically detects rich areas containing abundant duplicated 1369 | + * pages based. Rich areas are given a full scan speed. Poor areas are 1370 | + * sampled at a reasonable speed with very low CPU consumption. 1371 | + * 1372 | + * 3. Ultra Per-page scan speed improvement: 1373 | + * A new hash algorithm is proposed. As a result, on a machine with 1374 | + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it 1375 | + * can scan memory areas that does not contain duplicated pages at speed of 1376 | + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of 1377 | + * 477MB/sec ~ 923MB/sec. 1378 | + * 1379 | + * 4. Thrashing area avoidance: 1380 | + * Thrashing area(an VMA that has frequent Ksm page break-out) can be 1381 | + * filtered out. My benchmark shows it's more efficient than KSM's per-page 1382 | + * hash value based volatile page detection. 1383 | + * 1384 | + * 1385 | + * 5. Misc changes upon KSM: 1386 | + * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page 1387 | + * comparison. It's much faster than default C version on x86. 
1388 | + * * rmap_item now has an struct *page member to loosely cache a 1389 | + * address-->page mapping, which reduces too much time-costly 1390 | + * follow_page(). 1391 | + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. 1392 | + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ 1393 | + * ksm is needed for this case. 1394 | + * 1395 | + * 6. Full Zero Page consideration(contributed by Figo Zhang) 1396 | + * Now uksmd consider full zero pages as special pages and merge them to an 1397 | + * special unswappable uksm zero page. 1398 | + */ 1399 | + 1400 | +#include 1401 | +#include 1402 | +#include 1403 | +#include 1404 | +#include 1405 | +#include 1406 | +#include 1407 | +#include 1408 | +#include 1409 | +#include 1410 | +#include 1411 | +#include 1412 | +#include 1413 | +#include 1414 | +#include 1415 | +#include 1416 | +#include 1417 | +#include 1418 | +#include 1419 | +#include 1420 | +#include 1421 | +#include 1422 | +#include 1423 | +#include 1424 | +#include 1425 | +#include 1426 | +#include 1427 | + 1428 | +#include 1429 | +#include "internal.h" 1430 | + 1431 | +#ifdef CONFIG_X86 1432 | +#undef memcmp 1433 | + 1434 | +#ifdef CONFIG_X86_32 1435 | +#define memcmp memcmpx86_32 1436 | +/* 1437 | + * Compare 4-byte-aligned address s1 and s2, with length n 1438 | + */ 1439 | +int memcmpx86_32(void *s1, void *s2, size_t n) 1440 | +{ 1441 | + size_t num = n / 4; 1442 | + register int res; 1443 | + 1444 | + __asm__ __volatile__ 1445 | + ( 1446 | + "testl %3,%3\n\t" 1447 | + "repe; cmpsd\n\t" 1448 | + "je 1f\n\t" 1449 | + "sbbl %0,%0\n\t" 1450 | + "orl $1,%0\n" 1451 | + "1:" 1452 | + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) 1453 | + : "0" (0) 1454 | + : "cc"); 1455 | + 1456 | + return res; 1457 | +} 1458 | + 1459 | +/* 1460 | + * Check the page is all zero ? 
1461 | + */ 1462 | +static int is_full_zero(const void *s1, size_t len) 1463 | +{ 1464 | + unsigned char same; 1465 | + 1466 | + len /= 4; 1467 | + 1468 | + __asm__ __volatile__ 1469 | + ("repe; scasl;" 1470 | + "sete %0" 1471 | + : "=qm" (same), "+D" (s1), "+c" (len) 1472 | + : "a" (0) 1473 | + : "cc"); 1474 | + 1475 | + return same; 1476 | +} 1477 | + 1478 | + 1479 | +#elif defined(CONFIG_X86_64) 1480 | +#define memcmp memcmpx86_64 1481 | +/* 1482 | + * Compare 8-byte-aligned address s1 and s2, with length n 1483 | + */ 1484 | +int memcmpx86_64(void *s1, void *s2, size_t n) 1485 | +{ 1486 | + size_t num = n / 8; 1487 | + register int res; 1488 | + 1489 | + __asm__ __volatile__ 1490 | + ( 1491 | + "testq %q3,%q3\n\t" 1492 | + "repe; cmpsq\n\t" 1493 | + "je 1f\n\t" 1494 | + "sbbq %q0,%q0\n\t" 1495 | + "orq $1,%q0\n" 1496 | + "1:" 1497 | + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) 1498 | + : "0" (0) 1499 | + : "cc"); 1500 | + 1501 | + return res; 1502 | +} 1503 | + 1504 | +static int is_full_zero(const void *s1, size_t len) 1505 | +{ 1506 | + unsigned char same; 1507 | + 1508 | + len /= 8; 1509 | + 1510 | + __asm__ __volatile__ 1511 | + ("repe; scasq;" 1512 | + "sete %0" 1513 | + : "=qm" (same), "+D" (s1), "+c" (len) 1514 | + : "a" (0) 1515 | + : "cc"); 1516 | + 1517 | + return same; 1518 | +} 1519 | + 1520 | +#endif 1521 | +#else 1522 | +static int is_full_zero(const void *s1, size_t len) 1523 | +{ 1524 | + unsigned long *src = s1; 1525 | + int i; 1526 | + 1527 | + len /= sizeof(*src); 1528 | + 1529 | + for (i = 0; i < len; i++) { 1530 | + if (src[i]) 1531 | + return 0; 1532 | + } 1533 | + 1534 | + return 1; 1535 | +} 1536 | +#endif 1537 | + 1538 | +#define UKSM_RUNG_ROUND_FINISHED (1 << 0) 1539 | +#define TIME_RATIO_SCALE 10000 1540 | + 1541 | +#define SLOT_TREE_NODE_SHIFT 8 1542 | +#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) 1543 | +struct slot_tree_node { 1544 | + unsigned long size; 1545 | + struct sradix_tree_node snode; 1546 | + void *stores[SLOT_TREE_NODE_STORE_SIZE]; 1547 | +}; 1548 | + 1549 | +static struct kmem_cache *slot_tree_node_cachep; 1550 | + 1551 | +static struct sradix_tree_node *slot_tree_node_alloc(void) 1552 | +{ 1553 | + struct slot_tree_node *p; 1554 | + p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL); 1555 | + if (!p) 1556 | + return NULL; 1557 | + 1558 | + return &p->snode; 1559 | +} 1560 | + 1561 | +static void slot_tree_node_free(struct sradix_tree_node *node) 1562 | +{ 1563 | + struct slot_tree_node *p; 1564 | + 1565 | + p = container_of(node, struct slot_tree_node, snode); 1566 | + kmem_cache_free(slot_tree_node_cachep, p); 1567 | +} 1568 | + 1569 | +static void slot_tree_node_extend(struct sradix_tree_node *parent, 1570 | + struct sradix_tree_node *child) 1571 | +{ 1572 | + struct slot_tree_node *p, *c; 1573 | + 1574 | + p = container_of(parent, struct slot_tree_node, snode); 1575 | + c = container_of(child, struct slot_tree_node, snode); 1576 | + 1577 | + p->size += c->size; 1578 | +} 1579 | + 1580 | +void slot_tree_node_assign(struct sradix_tree_node *node, 1581 | + unsigned index, void *item) 1582 | +{ 1583 | + struct vma_slot *slot = item; 1584 | + struct slot_tree_node *cur; 1585 | + 1586 | + slot->snode = node; 1587 | + slot->sindex = index; 1588 | + 1589 | + while (node) { 1590 | + cur = container_of(node, struct slot_tree_node, snode); 1591 | + cur->size += slot->pages; 1592 | + node = node->parent; 1593 | + } 1594 | +} 1595 | + 1596 | +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset) 1597 | 
+{ 1598 | + struct vma_slot *slot; 1599 | + struct slot_tree_node *cur; 1600 | + unsigned long pages; 1601 | + 1602 | + if (node->height == 1) { 1603 | + slot = node->stores[offset]; 1604 | + pages = slot->pages; 1605 | + } else { 1606 | + cur = container_of(node->stores[offset], 1607 | + struct slot_tree_node, snode); 1608 | + pages = cur->size; 1609 | + } 1610 | + 1611 | + while (node) { 1612 | + cur = container_of(node, struct slot_tree_node, snode); 1613 | + cur->size -= pages; 1614 | + node = node->parent; 1615 | + } 1616 | +} 1617 | + 1618 | +unsigned long slot_iter_index; 1619 | +int slot_iter(void *item, unsigned long height) 1620 | +{ 1621 | + struct slot_tree_node *node; 1622 | + struct vma_slot *slot; 1623 | + 1624 | + if (height == 1) { 1625 | + slot = item; 1626 | + if (slot_iter_index < slot->pages) { 1627 | + /*in this one*/ 1628 | + return 1; 1629 | + } else { 1630 | + slot_iter_index -= slot->pages; 1631 | + return 0; 1632 | + } 1633 | + 1634 | + } else { 1635 | + node = container_of(item, struct slot_tree_node, snode); 1636 | + if (slot_iter_index < node->size) { 1637 | + /*in this one*/ 1638 | + return 1; 1639 | + } else { 1640 | + slot_iter_index -= node->size; 1641 | + return 0; 1642 | + } 1643 | + } 1644 | +} 1645 | + 1646 | + 1647 | +static inline void slot_tree_init_root(struct sradix_tree_root *root) 1648 | +{ 1649 | + init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); 1650 | + root->alloc = slot_tree_node_alloc; 1651 | + root->free = slot_tree_node_free; 1652 | + root->extend = slot_tree_node_extend; 1653 | + root->assign = slot_tree_node_assign; 1654 | + root->rm = slot_tree_node_rm; 1655 | +} 1656 | + 1657 | +void slot_tree_init(void) 1658 | +{ 1659 | + slot_tree_node_cachep = kmem_cache_create("slot_tree_node", 1660 | + sizeof(struct slot_tree_node), 0, 1661 | + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, 1662 | + NULL); 1663 | +} 1664 | + 1665 | + 1666 | +/* Each rung of this ladder is a list of VMAs having a same scan ratio */ 1667 | +struct scan_rung { 1668 | + //struct list_head scanned_list; 1669 | + struct sradix_tree_root vma_root; 1670 | + struct sradix_tree_root vma_root2; 1671 | + 1672 | + struct vma_slot *current_scan; 1673 | + unsigned long current_offset; 1674 | + 1675 | + /* 1676 | + * The initial value for current_offset, it should loop over 1677 | + * [0~ step - 1] to let all slot have its chance to be scanned. 1678 | + */ 1679 | + unsigned long offset_init; 1680 | + unsigned long step; /* dynamic step for current_offset */ 1681 | + unsigned int flags; 1682 | + unsigned long pages_to_scan; 1683 | + //unsigned long fully_scanned_slots; 1684 | + /* 1685 | + * a little bit tricky - if cpu_time_ratio > 0, then the value is the 1686 | + * the cpu time ratio it can spend in rung_i for every scan 1687 | + * period. if < 0, then it is the cpu time ratio relative to the 1688 | + * max cpu percentage user specified. Both in unit of 1689 | + * 1/TIME_RATIO_SCALE 1690 | + */ 1691 | + int cpu_ratio; 1692 | + 1693 | + /* 1694 | + * How long it will take for all slots in this rung to be fully 1695 | + * scanned? If it's zero, we don't care about the cover time: 1696 | + * it's fully scanned. 
1697 | + */ 1698 | + unsigned int cover_msecs; 1699 | + //unsigned long vma_num; 1700 | + //unsigned long pages; /* Sum of all slot's pages in rung */ 1701 | +}; 1702 | + 1703 | +/** 1704 | + * node of either the stable or unstale rbtree 1705 | + * 1706 | + */ 1707 | +struct tree_node { 1708 | + struct rb_node node; /* link in the main (un)stable rbtree */ 1709 | + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ 1710 | + u32 hash; 1711 | + unsigned long count; /* TODO: merged with sub_root */ 1712 | + struct list_head all_list; /* all tree nodes in stable/unstable tree */ 1713 | +}; 1714 | + 1715 | +/** 1716 | + * struct stable_node - node of the stable rbtree 1717 | + * @node: rb node of this ksm page in the stable tree 1718 | + * @hlist: hlist head of rmap_items using this ksm page 1719 | + * @kpfn: page frame number of this ksm page 1720 | + */ 1721 | +struct stable_node { 1722 | + struct rb_node node; /* link in sub-rbtree */ 1723 | + struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ 1724 | + struct hlist_head hlist; 1725 | + unsigned long kpfn; 1726 | + u32 hash_max; /* if ==0 then it's not been calculated yet */ 1727 | + struct list_head all_list; /* in a list for all stable nodes */ 1728 | +}; 1729 | + 1730 | +/** 1731 | + * struct node_vma - group rmap_items linked in a same stable 1732 | + * node together. 1733 | + */ 1734 | +struct node_vma { 1735 | + union { 1736 | + struct vma_slot *slot; 1737 | + unsigned long key; /* slot is used as key sorted on hlist */ 1738 | + }; 1739 | + struct hlist_node hlist; 1740 | + struct hlist_head rmap_hlist; 1741 | + struct stable_node *head; 1742 | +}; 1743 | + 1744 | +/** 1745 | + * struct rmap_item - reverse mapping item for virtual addresses 1746 | + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list 1747 | + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree 1748 | + * @mm: the memory structure this rmap_item is pointing into 1749 | + * @address: the virtual address this rmap_item tracks (+ flags in low bits) 1750 | + * @node: rb node of this rmap_item in the unstable tree 1751 | + * @head: pointer to stable_node heading this list in the stable tree 1752 | + * @hlist: link into hlist of rmap_items hanging off that stable_node 1753 | + */ 1754 | +struct rmap_item { 1755 | + struct vma_slot *slot; 1756 | + struct page *page; 1757 | + unsigned long address; /* + low bits used for flags below */ 1758 | + unsigned long hash_round; 1759 | + unsigned long entry_index; 1760 | + union { 1761 | + struct {/* when in unstable tree */ 1762 | + struct rb_node node; 1763 | + struct tree_node *tree_node; 1764 | + u32 hash_max; 1765 | + }; 1766 | + struct { /* when in stable tree */ 1767 | + struct node_vma *head; 1768 | + struct hlist_node hlist; 1769 | + struct anon_vma *anon_vma; 1770 | + }; 1771 | + }; 1772 | +} __attribute__((aligned(4))); 1773 | + 1774 | +struct rmap_list_entry { 1775 | + union { 1776 | + struct rmap_item *item; 1777 | + unsigned long addr; 1778 | + }; 1779 | + /* lowest bit is used for is_addr tag */ 1780 | +} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/ 1781 | + 1782 | + 1783 | +/* Basic data structure definition ends */ 1784 | + 1785 | + 1786 | +/* 1787 | + * Flags for rmap_item to judge if it's listed in the stable/unstable tree. 
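The slot tree built a little earlier (slot_tree_node, slot_iter) keeps a running page count in every node, and slot_iter() walks down the tree consuming slot_iter_index until it reaches the vma_slot that covers that index, so larger slots are proportionally more likely to be picked. A minimal userspace sketch of the same weighted descent over a flat two-level layout (the struct and helper names here are illustrative only, not the patch's sradix tree API):

#include <stdio.h>

/* hypothetical two-level index: groups of slots, each group caching the
 * sum of its slots' page counts, in the spirit of slot_tree_node->size */
struct group { unsigned long size; unsigned long pages[4]; };

/* descend like slot_iter(): skip whole groups while the index is past
 * their cumulative size, then skip slots inside the chosen group */
static int pick_slot(struct group *g, int ngroups, unsigned long idx,
                     int *slot_out)
{
    int i, j;

    for (i = 0; i < ngroups; i++) {
        if (idx < g[i].size)
            break;              /* the target page lives in this group */
        idx -= g[i].size;       /* otherwise consume the whole group   */
    }
    if (i == ngroups)
        return -1;              /* index past the total page count     */

    for (j = 0; j < 4; j++) {
        if (idx < g[i].pages[j])
            break;
        idx -= g[i].pages[j];
    }
    *slot_out = j;
    return i;
}

int main(void)
{
    struct group g[2] = {
        { .size = 10, .pages = { 1, 2, 3, 4 } },
        { .size = 26, .pages = { 20, 2, 2, 2 } },
    };
    int slot, grp = pick_slot(g, 2, 13, &slot);

    /* index 13 falls 3 pages into group 1's first (20-page) slot */
    printf("group %d, slot %d\n", grp, slot);
    return 0;
}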
1788 | + * The flags use the low bits of rmap_item.address 1789 | + */ 1790 | +#define UNSTABLE_FLAG 0x1 1791 | +#define STABLE_FLAG 0x2 1792 | +#define get_rmap_addr(x) ((x)->address & PAGE_MASK) 1793 | + 1794 | +/* 1795 | + * rmap_list_entry helpers 1796 | + */ 1797 | +#define IS_ADDR_FLAG 1 1798 | +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) 1799 | +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) 1800 | +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) 1801 | + 1802 | + 1803 | +/* 1804 | + * High speed caches for frequently allocated and freed structs 1805 | + */ 1806 | +static struct kmem_cache *rmap_item_cache; 1807 | +static struct kmem_cache *stable_node_cache; 1808 | +static struct kmem_cache *node_vma_cache; 1809 | +static struct kmem_cache *vma_slot_cache; 1810 | +static struct kmem_cache *tree_node_cache; 1811 | +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ 1812 | + sizeof(struct __struct), __alignof__(struct __struct),\ 1813 | + (__flags), NULL) 1814 | + 1815 | +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ 1816 | +#define SCAN_LADDER_SIZE 4 1817 | +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; 1818 | + 1819 | +/* The evaluation rounds uksmd has finished */ 1820 | +static unsigned long long uksm_eval_round = 1; 1821 | + 1822 | +/* 1823 | + * we add 1 to this var when we consider we should rebuild the whole 1824 | + * unstable tree. 1825 | + */ 1826 | +static unsigned long uksm_hash_round = 1; 1827 | + 1828 | +/* 1829 | + * How many times the whole memory is scanned. 1830 | + */ 1831 | +static unsigned long long fully_scanned_round = 1; 1832 | + 1833 | +/* The total number of virtual pages of all vma slots */ 1834 | +static u64 uksm_pages_total; 1835 | + 1836 | +/* The number of pages has been scanned since the start up */ 1837 | +static u64 uksm_pages_scanned; 1838 | + 1839 | +static u64 scanned_virtual_pages; 1840 | + 1841 | +/* The number of pages has been scanned since last encode_benefit call */ 1842 | +static u64 uksm_pages_scanned_last; 1843 | + 1844 | +/* If the scanned number is tooo large, we encode it here */ 1845 | +static u64 pages_scanned_stored; 1846 | + 1847 | +static unsigned long pages_scanned_base; 1848 | + 1849 | +/* The number of nodes in the stable tree */ 1850 | +static unsigned long uksm_pages_shared; 1851 | + 1852 | +/* The number of page slots additionally sharing those nodes */ 1853 | +static unsigned long uksm_pages_sharing; 1854 | + 1855 | +/* The number of nodes in the unstable tree */ 1856 | +static unsigned long uksm_pages_unshared; 1857 | + 1858 | +/* 1859 | + * Milliseconds ksmd should sleep between scans, 1860 | + * >= 100ms to be consistent with 1861 | + * scan_time_to_sleep_msec() 1862 | + */ 1863 | +static unsigned int uksm_sleep_jiffies; 1864 | + 1865 | +/* The real value for the uksmd next sleep */ 1866 | +static unsigned int uksm_sleep_real; 1867 | + 1868 | +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ 1869 | +static unsigned int uksm_sleep_saved; 1870 | + 1871 | +/* Max percentage of cpu utilization ksmd can take to scan in one batch */ 1872 | +static unsigned int uksm_max_cpu_percentage; 1873 | + 1874 | +static int uksm_cpu_governor; 1875 | + 1876 | +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; 1877 | + 1878 | +struct uksm_cpu_preset_s { 1879 | + int cpu_ratio[SCAN_LADDER_SIZE]; 1880 | + unsigned int cover_msecs[SCAN_LADDER_SIZE]; 1881 | + unsigned int 
max_cpu; /* percentage */ 1882 | +}; 1883 | + 1884 | +struct uksm_cpu_preset_s uksm_cpu_preset[4] = { 1885 | + { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, 1886 | + { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, 1887 | + { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, 1888 | + { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, 1889 | +}; 1890 | + 1891 | +/* The default value for uksm_ema_page_time if it's not initialized */ 1892 | +#define UKSM_PAGE_TIME_DEFAULT 500 1893 | + 1894 | +/*cost to scan one page by expotional moving average in nsecs */ 1895 | +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; 1896 | + 1897 | +/* The expotional moving average alpha weight, in percentage. */ 1898 | +#define EMA_ALPHA 20 1899 | + 1900 | +/* 1901 | + * The threshold used to filter out thrashing areas, 1902 | + * If it == 0, filtering is disabled, otherwise it's the percentage up-bound 1903 | + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio 1904 | + * will be considered as having a zero duplication ratio. 1905 | + */ 1906 | +static unsigned int uksm_thrash_threshold = 50; 1907 | + 1908 | +/* How much dedup ratio is considered to be abundant*/ 1909 | +static unsigned int uksm_abundant_threshold = 10; 1910 | + 1911 | +/* All slots having merged pages in this eval round. */ 1912 | +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); 1913 | + 1914 | +/* How many times the ksmd has slept since startup */ 1915 | +static unsigned long long uksm_sleep_times; 1916 | + 1917 | +#define UKSM_RUN_STOP 0 1918 | +#define UKSM_RUN_MERGE 1 1919 | +static unsigned int uksm_run = 1; 1920 | + 1921 | +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); 1922 | +static DEFINE_MUTEX(uksm_thread_mutex); 1923 | + 1924 | +/* 1925 | + * List vma_slot_new is for newly created vma_slot waiting to be added by 1926 | + * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to 1927 | + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding 1928 | + * VMA has been removed/freed. 1929 | + */ 1930 | +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); 1931 | +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); 1932 | +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); 1933 | +static DEFINE_SPINLOCK(vma_slot_list_lock); 1934 | + 1935 | +/* The unstable tree heads */ 1936 | +static struct rb_root root_unstable_tree = RB_ROOT; 1937 | + 1938 | +/* 1939 | + * All tree_nodes are in a list to be freed at once when unstable tree is 1940 | + * freed after each scan round. 1941 | + */ 1942 | +static struct list_head unstable_tree_node_list = 1943 | + LIST_HEAD_INIT(unstable_tree_node_list); 1944 | + 1945 | +/* List contains all stable nodes */ 1946 | +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); 1947 | + 1948 | +/* 1949 | + * When the hash strength is changed, the stable tree must be delta_hashed and 1950 | + * re-structured. We use two set of below structs to speed up the 1951 | + * re-structuring of stable tree. 
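uksm_ema_page_time above tracks the cost of scanning one page as an exponential moving average with EMA_ALPHA = 20 percent; the update expression itself is not shown in this part of the patch, so the sketch below only illustrates the standard EMA form those constants imply, using a hypothetical helper name:

#include <stdio.h>

#define EMA_ALPHA 20                 /* weight of the newest sample, in % */
#define UKSM_PAGE_TIME_DEFAULT 500   /* ns, used until real samples exist */

static unsigned long ema_page_time = UKSM_PAGE_TIME_DEFAULT;

/* hypothetical helper: blend a fresh per-page cost sample (in ns) into
 * the running average: new = alpha*sample + (100-alpha)*old, in percent */
static unsigned long ema_update(unsigned long sample_ns)
{
    ema_page_time = (EMA_ALPHA * sample_ns +
                     (100 - EMA_ALPHA) * ema_page_time) / 100;
    return ema_page_time;
}

int main(void)
{
    /* a burst of slower scans pulls the estimate up only gradually */
    printf("%lu\n", ema_update(1000));   /* 600 */
    printf("%lu\n", ema_update(1000));   /* 680 */
    printf("%lu\n", ema_update(200));    /* 584 */
    return 0;
}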
1952 | + */ 1953 | +static struct list_head 1954 | +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), 1955 | + LIST_HEAD_INIT(stable_tree_node_list[1])}; 1956 | + 1957 | +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; 1958 | +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; 1959 | +static struct rb_root *root_stable_treep = &root_stable_tree[0]; 1960 | +static unsigned long stable_tree_index; 1961 | + 1962 | +/* The hash strength needed to hash a full page */ 1963 | +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) 1964 | + 1965 | +/* The hash strength needed for loop-back hashing */ 1966 | +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) 1967 | + 1968 | +/* The random offsets in a page */ 1969 | +static u32 *random_nums; 1970 | + 1971 | +/* The hash strength */ 1972 | +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; 1973 | + 1974 | +/* The delta value each time the hash strength increases or decreases */ 1975 | +static unsigned long hash_strength_delta; 1976 | +#define HASH_STRENGTH_DELTA_MAX 5 1977 | + 1978 | +/* The time we have saved due to random_sample_hash */ 1979 | +static u64 rshash_pos; 1980 | + 1981 | +/* The time we have wasted due to hash collision */ 1982 | +static u64 rshash_neg; 1983 | + 1984 | +struct uksm_benefit { 1985 | + u64 pos; 1986 | + u64 neg; 1987 | + u64 scanned; 1988 | + unsigned long base; 1989 | +} benefit; 1990 | + 1991 | +/* 1992 | + * The relative cost of memcmp, compared to 1 time unit of random sample 1993 | + * hash, this value is tested when ksm module is initialized 1994 | + */ 1995 | +static unsigned long memcmp_cost; 1996 | + 1997 | +static unsigned long rshash_neg_cont_zero; 1998 | +static unsigned long rshash_cont_obscure; 1999 | + 2000 | +/* The possible states of hash strength adjustment heuristic */ 2001 | +enum rshash_states { 2002 | + RSHASH_STILL, 2003 | + RSHASH_TRYUP, 2004 | + RSHASH_TRYDOWN, 2005 | + RSHASH_NEW, 2006 | + RSHASH_PRE_STILL, 2007 | +}; 2008 | + 2009 | +/* The possible direction we are about to adjust hash strength */ 2010 | +enum rshash_direct { 2011 | + GO_UP, 2012 | + GO_DOWN, 2013 | + OBSCURE, 2014 | + STILL, 2015 | +}; 2016 | + 2017 | +/* random sampling hash state machine */ 2018 | +static struct { 2019 | + enum rshash_states state; 2020 | + enum rshash_direct pre_direct; 2021 | + u8 below_count; 2022 | + /* Keep a lookup window of size 5, iff above_count/below_count > 3 2023 | + * in this window we stop trying. 
2024 | + */ 2025 | + u8 lookup_window_index; 2026 | + u64 stable_benefit; 2027 | + unsigned long turn_point_down; 2028 | + unsigned long turn_benefit_down; 2029 | + unsigned long turn_point_up; 2030 | + unsigned long turn_benefit_up; 2031 | + unsigned long stable_point; 2032 | +} rshash_state; 2033 | + 2034 | +/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ 2035 | +static u32 *zero_hash_table; 2036 | + 2037 | +static inline struct node_vma *alloc_node_vma(void) 2038 | +{ 2039 | + struct node_vma *node_vma; 2040 | + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL); 2041 | + if (node_vma) { 2042 | + INIT_HLIST_HEAD(&node_vma->rmap_hlist); 2043 | + INIT_HLIST_NODE(&node_vma->hlist); 2044 | + } 2045 | + return node_vma; 2046 | +} 2047 | + 2048 | +static inline void free_node_vma(struct node_vma *node_vma) 2049 | +{ 2050 | + kmem_cache_free(node_vma_cache, node_vma); 2051 | +} 2052 | + 2053 | + 2054 | +static inline struct vma_slot *alloc_vma_slot(void) 2055 | +{ 2056 | + struct vma_slot *slot; 2057 | + 2058 | + /* 2059 | + * In case ksm is not initialized by now. 2060 | + * Oops, we need to consider the call site of uksm_init() in the future. 2061 | + */ 2062 | + if (!vma_slot_cache) 2063 | + return NULL; 2064 | + 2065 | + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL); 2066 | + if (slot) { 2067 | + INIT_LIST_HEAD(&slot->slot_list); 2068 | + INIT_LIST_HEAD(&slot->dedup_list); 2069 | + slot->flags |= UKSM_SLOT_NEED_RERAND; 2070 | + } 2071 | + return slot; 2072 | +} 2073 | + 2074 | +static inline void free_vma_slot(struct vma_slot *vma_slot) 2075 | +{ 2076 | + kmem_cache_free(vma_slot_cache, vma_slot); 2077 | +} 2078 | + 2079 | + 2080 | + 2081 | +static inline struct rmap_item *alloc_rmap_item(void) 2082 | +{ 2083 | + struct rmap_item *rmap_item; 2084 | + 2085 | + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); 2086 | + if (rmap_item) { 2087 | + /* bug on lowest bit is not clear for flag use */ 2088 | + BUG_ON(is_addr(rmap_item)); 2089 | + } 2090 | + return rmap_item; 2091 | +} 2092 | + 2093 | +static inline void free_rmap_item(struct rmap_item *rmap_item) 2094 | +{ 2095 | + rmap_item->slot = NULL; /* debug safety */ 2096 | + kmem_cache_free(rmap_item_cache, rmap_item); 2097 | +} 2098 | + 2099 | +static inline struct stable_node *alloc_stable_node(void) 2100 | +{ 2101 | + struct stable_node *node; 2102 | + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC); 2103 | + if (!node) 2104 | + return NULL; 2105 | + 2106 | + INIT_HLIST_HEAD(&node->hlist); 2107 | + list_add(&node->all_list, &stable_node_list); 2108 | + return node; 2109 | +} 2110 | + 2111 | +static inline void free_stable_node(struct stable_node *stable_node) 2112 | +{ 2113 | + list_del(&stable_node->all_list); 2114 | + kmem_cache_free(stable_node_cache, stable_node); 2115 | +} 2116 | + 2117 | +static inline struct tree_node *alloc_tree_node(struct list_head *list) 2118 | +{ 2119 | + struct tree_node *node; 2120 | + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC); 2121 | + if (!node) 2122 | + return NULL; 2123 | + 2124 | + list_add(&node->all_list, list); 2125 | + return node; 2126 | +} 2127 | + 2128 | +static inline void free_tree_node(struct tree_node *node) 2129 | +{ 2130 | + list_del(&node->all_list); 2131 | + kmem_cache_free(tree_node_cache, node); 2132 | +} 2133 | + 2134 | +static void uksm_drop_anon_vma(struct rmap_item *rmap_item) 2135 | +{ 2136 | + struct anon_vma *anon_vma = rmap_item->anon_vma; 2137 | + 2138 | + put_anon_vma(anon_vma); 2139 | +} 2140 
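Several of the structures above squeeze state into low pointer bits: rmap_item.address keeps its STABLE/UNSTABLE flags below PAGE_MASK, and rmap_list_entry uses IS_ADDR_FLAG in bit 0 to record whether the union currently holds a raw address or an rmap_item pointer, which is why alloc_rmap_item() can BUG_ON(is_addr(...)) on a freshly allocated object. A small standalone sketch of that tagging trick (not the kernel macros themselves):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define IS_ADDR_FLAG 1UL

/* allocated structs are at least word aligned, so bit 0 is free for a tag */
struct item { int dummy; } __attribute__((aligned(4)));

static inline int is_addr(unsigned long v)             { return v & IS_ADDR_FLAG; }
static inline unsigned long tag_addr(unsigned long a)  { return a | IS_ADDR_FLAG; }
static inline unsigned long clean_addr(unsigned long v){ return v & ~IS_ADDR_FLAG; }

int main(void)
{
    static struct item it;
    unsigned long entry;

    entry = (unsigned long)&it;          /* stores a pointer: bit 0 clear */
    assert(!is_addr(entry));

    entry = tag_addr(0x7f0000401000UL);  /* stores a bare address: tagged */
    assert(is_addr(entry));
    printf("addr = %#lx\n", clean_addr(entry));
    return 0;
}

The trick only works because the tagged objects are at least 4-byte aligned, which is exactly what the aligned(4) attributes on rmap_item and rmap_list_entry guarantee.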
| + 2141 | + 2142 | +/** 2143 | + * Remove a stable node from stable_tree, may unlink from its tree_node and 2144 | + * may remove its parent tree_node if no other stable node is pending. 2145 | + * 2146 | + * @stable_node The node need to be removed 2147 | + * @unlink_rb Will this node be unlinked from the rbtree? 2148 | + * @remove_tree_ node Will its tree_node be removed if empty? 2149 | + */ 2150 | +static void remove_node_from_stable_tree(struct stable_node *stable_node, 2151 | + int unlink_rb, int remove_tree_node) 2152 | +{ 2153 | + struct node_vma *node_vma; 2154 | + struct rmap_item *rmap_item; 2155 | + struct hlist_node *n; 2156 | + 2157 | + if (!hlist_empty(&stable_node->hlist)) { 2158 | + hlist_for_each_entry_safe(node_vma, n, 2159 | + &stable_node->hlist, hlist) { 2160 | + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { 2161 | + uksm_pages_sharing--; 2162 | + 2163 | + uksm_drop_anon_vma(rmap_item); 2164 | + rmap_item->address &= PAGE_MASK; 2165 | + } 2166 | + free_node_vma(node_vma); 2167 | + cond_resched(); 2168 | + } 2169 | + 2170 | + /* the last one is counted as shared */ 2171 | + uksm_pages_shared--; 2172 | + uksm_pages_sharing++; 2173 | + } 2174 | + 2175 | + if (stable_node->tree_node && unlink_rb) { 2176 | + rb_erase(&stable_node->node, 2177 | + &stable_node->tree_node->sub_root); 2178 | + 2179 | + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && 2180 | + remove_tree_node) { 2181 | + rb_erase(&stable_node->tree_node->node, 2182 | + root_stable_treep); 2183 | + free_tree_node(stable_node->tree_node); 2184 | + } else { 2185 | + stable_node->tree_node->count--; 2186 | + } 2187 | + } 2188 | + 2189 | + free_stable_node(stable_node); 2190 | +} 2191 | + 2192 | + 2193 | +/* 2194 | + * get_uksm_page: checks if the page indicated by the stable node 2195 | + * is still its ksm page, despite having held no reference to it. 2196 | + * In which case we can trust the content of the page, and it 2197 | + * returns the gotten page; but if the page has now been zapped, 2198 | + * remove the stale node from the stable tree and return NULL. 2199 | + * 2200 | + * You would expect the stable_node to hold a reference to the ksm page. 2201 | + * But if it increments the page's count, swapping out has to wait for 2202 | + * ksmd to come around again before it can free the page, which may take 2203 | + * seconds or even minutes: much too unresponsive. So instead we use a 2204 | + * "keyhole reference": access to the ksm page from the stable node peeps 2205 | + * out through its keyhole to see if that page still holds the right key, 2206 | + * pointing back to this stable node. This relies on freeing a PageAnon 2207 | + * page to reset its page->mapping to NULL, and relies on no other use of 2208 | + * a page to put something that might look like our key in page->mapping. 2209 | + * 2210 | + * include/linux/pagemap.h page_cache_get_speculative() is a good reference, 2211 | + * but this is different - made simpler by uksm_thread_mutex being held, but 2212 | + * interesting for assuming that no other use of the struct page could ever 2213 | + * put our expected_mapping into page->mapping (or a field of the union which 2214 | + * coincides with page->mapping). The RCU calls are not for KSM at all, but 2215 | + * to keep the page_count protocol described with page_cache_get_speculative. 
2216 | + * 2217 | + * Note: it is possible that get_uksm_page() will return NULL one moment, 2218 | + * then page the next, if the page is in between page_freeze_refs() and 2219 | + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page 2220 | + * is on its way to being freed; but it is an anomaly to bear in mind. 2221 | + * 2222 | + * @unlink_rb: if the removal of this node will firstly unlink from 2223 | + * its rbtree. stable_node_reinsert will prevent this when restructuring the 2224 | + * node from its old tree. 2225 | + * 2226 | + * @remove_tree_node: if this is the last one of its tree_node, will the 2227 | + * tree_node be freed ? If we are inserting stable node, this tree_node may 2228 | + * be reused, so don't free it. 2229 | + */ 2230 | +static struct page *get_uksm_page(struct stable_node *stable_node, 2231 | + int unlink_rb, int remove_tree_node) 2232 | +{ 2233 | + struct page *page; 2234 | + void *expected_mapping; 2235 | + unsigned long kpfn; 2236 | + 2237 | +again: 2238 | + kpfn = stable_node->kpfn; 2239 | + page = pfn_to_page(kpfn); 2240 | + expected_mapping = (void *)stable_node + 2241 | + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 2242 | + 2243 | + if (page->mapping != expected_mapping) 2244 | + goto stale; 2245 | + if (!get_page_unless_zero(page)) 2246 | + goto stale; 2247 | + if (page->mapping != expected_mapping) { 2248 | + put_page(page); 2249 | + goto stale; 2250 | + } 2251 | + 2252 | + lock_page(page); 2253 | + if (page->mapping != expected_mapping) { 2254 | + unlock_page(page); 2255 | + put_page(page); 2256 | + goto stale; 2257 | + } 2258 | + unlock_page(page); 2259 | + return page; 2260 | +stale: 2261 | + /* 2262 | + * We come here from above when page->mapping or !PageSwapCache 2263 | + * suggests that the node is stale; but it might be under migration. 2264 | + * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), 2265 | + * before checking whether node->kpfn has been changed. 2266 | + */ 2267 | + smp_rmb(); 2268 | + if (stable_node->kpfn != kpfn) 2269 | + goto again; 2270 | + 2271 | + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); 2272 | + 2273 | + return NULL; 2274 | +} 2275 | + 2276 | +/* 2277 | + * Removing rmap_item from stable or unstable tree. 2278 | + * This function will clean the information from the stable/unstable tree. 2279 | + */ 2280 | +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) 2281 | +{ 2282 | + if (rmap_item->address & STABLE_FLAG) { 2283 | + struct stable_node *stable_node; 2284 | + struct node_vma *node_vma; 2285 | + struct page *page; 2286 | + 2287 | + node_vma = rmap_item->head; 2288 | + stable_node = node_vma->head; 2289 | + page = get_uksm_page(stable_node, 1, 1); 2290 | + if (!page) 2291 | + goto out; 2292 | + 2293 | + /* 2294 | + * page lock is needed because it's racing with 2295 | + * try_to_unmap_ksm(), etc. 2296 | + */ 2297 | + lock_page(page); 2298 | + hlist_del(&rmap_item->hlist); 2299 | + 2300 | + if (hlist_empty(&node_vma->rmap_hlist)) { 2301 | + hlist_del(&node_vma->hlist); 2302 | + free_node_vma(node_vma); 2303 | + } 2304 | + unlock_page(page); 2305 | + 2306 | + put_page(page); 2307 | + if (hlist_empty(&stable_node->hlist)) { 2308 | + /* do NOT call remove_node_from_stable_tree() here, 2309 | + * it's possible for a forked rmap_item not in 2310 | + * stable tree while the in-tree rmap_items were 2311 | + * deleted. 
2312 | + */ 2313 | + uksm_pages_shared--; 2314 | + } else 2315 | + uksm_pages_sharing--; 2316 | + 2317 | + 2318 | + uksm_drop_anon_vma(rmap_item); 2319 | + } else if (rmap_item->address & UNSTABLE_FLAG) { 2320 | + if (rmap_item->hash_round == uksm_hash_round) { 2321 | + 2322 | + rb_erase(&rmap_item->node, 2323 | + &rmap_item->tree_node->sub_root); 2324 | + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { 2325 | + rb_erase(&rmap_item->tree_node->node, 2326 | + &root_unstable_tree); 2327 | + 2328 | + free_tree_node(rmap_item->tree_node); 2329 | + } else 2330 | + rmap_item->tree_node->count--; 2331 | + } 2332 | + uksm_pages_unshared--; 2333 | + } 2334 | + 2335 | + rmap_item->address &= PAGE_MASK; 2336 | + rmap_item->hash_max = 0; 2337 | + 2338 | +out: 2339 | + cond_resched(); /* we're called from many long loops */ 2340 | +} 2341 | + 2342 | +static inline int slot_in_uksm(struct vma_slot *slot) 2343 | +{ 2344 | + return list_empty(&slot->slot_list); 2345 | +} 2346 | + 2347 | +/* 2348 | + * Test if the mm is exiting 2349 | + */ 2350 | +static inline bool uksm_test_exit(struct mm_struct *mm) 2351 | +{ 2352 | + return atomic_read(&mm->mm_users) == 0; 2353 | +} 2354 | + 2355 | +static inline unsigned long vma_pool_size(struct vma_slot *slot) 2356 | +{ 2357 | + return round_up(sizeof(struct rmap_list_entry) * slot->pages, 2358 | + PAGE_SIZE) >> PAGE_SHIFT; 2359 | +} 2360 | + 2361 | +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) 2362 | + 2363 | +/* must be done with sem locked */ 2364 | +static int slot_pool_alloc(struct vma_slot *slot) 2365 | +{ 2366 | + unsigned long pool_size; 2367 | + 2368 | + if (slot->rmap_list_pool) 2369 | + return 0; 2370 | + 2371 | + pool_size = vma_pool_size(slot); 2372 | + slot->rmap_list_pool = kzalloc(sizeof(struct page *) * 2373 | + pool_size, GFP_KERNEL); 2374 | + if (!slot->rmap_list_pool) 2375 | + return -ENOMEM; 2376 | + 2377 | + slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size, 2378 | + GFP_KERNEL); 2379 | + if (!slot->pool_counts) { 2380 | + kfree(slot->rmap_list_pool); 2381 | + return -ENOMEM; 2382 | + } 2383 | + 2384 | + slot->pool_size = pool_size; 2385 | + BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); 2386 | + slot->flags |= UKSM_SLOT_IN_UKSM; 2387 | + uksm_pages_total += slot->pages; 2388 | + 2389 | + return 0; 2390 | +} 2391 | + 2392 | +/* 2393 | + * Called after vma is unlinked from its mm 2394 | + */ 2395 | +void uksm_remove_vma(struct vm_area_struct *vma) 2396 | +{ 2397 | + struct vma_slot *slot; 2398 | + 2399 | + if (!vma->uksm_vma_slot) 2400 | + return; 2401 | + 2402 | + spin_lock(&vma_slot_list_lock); 2403 | + slot = vma->uksm_vma_slot; 2404 | + if (!slot) 2405 | + goto out; 2406 | + 2407 | + if (slot_in_uksm(slot)) { 2408 | + /** 2409 | + * This slot has been added by ksmd, so move to the del list 2410 | + * waiting ksmd to free it. 2411 | + */ 2412 | + list_add_tail(&slot->slot_list, &vma_slot_del); 2413 | + } else { 2414 | + /** 2415 | + * It's still on new list. It's ok to free slot directly. 2416 | + */ 2417 | + list_del(&slot->slot_list); 2418 | + free_vma_slot(slot); 2419 | + } 2420 | +out: 2421 | + vma->uksm_vma_slot = NULL; 2422 | + spin_unlock(&vma_slot_list_lock); 2423 | +} 2424 | + 2425 | +/** 2426 | + * Need to do two things: 2427 | + * 1. check if slot was moved to del list 2428 | + * 2. make sure the mmap_sem is manipulated under valid vma. 
2429 | + * 2430 | + * My concern here is that in some cases, this may make 2431 | + * vma_slot_list_lock() waiters to serialized further by some 2432 | + * sem->wait_lock, can this really be expensive? 2433 | + * 2434 | + * 2435 | + * @return 2436 | + * 0: if successfully locked mmap_sem 2437 | + * -ENOENT: this slot was moved to del list 2438 | + * -EBUSY: vma lock failed 2439 | + */ 2440 | +static int try_down_read_slot_mmap_sem(struct vma_slot *slot) 2441 | +{ 2442 | + struct vm_area_struct *vma; 2443 | + struct mm_struct *mm; 2444 | + struct rw_semaphore *sem; 2445 | + 2446 | + spin_lock(&vma_slot_list_lock); 2447 | + 2448 | + /* the slot_list was removed and inited from new list, when it enters 2449 | + * uksm_list. If now it's not empty, then it must be moved to del list 2450 | + */ 2451 | + if (!slot_in_uksm(slot)) { 2452 | + spin_unlock(&vma_slot_list_lock); 2453 | + return -ENOENT; 2454 | + } 2455 | + 2456 | + BUG_ON(slot->pages != vma_pages(slot->vma)); 2457 | + /* Ok, vma still valid */ 2458 | + vma = slot->vma; 2459 | + mm = vma->vm_mm; 2460 | + sem = &mm->mmap_sem; 2461 | + 2462 | + if (uksm_test_exit(mm)) { 2463 | + spin_unlock(&vma_slot_list_lock); 2464 | + return -ENOENT; 2465 | + } 2466 | + 2467 | + if (down_read_trylock(sem)) { 2468 | + spin_unlock(&vma_slot_list_lock); 2469 | + if (slot_pool_alloc(slot)) { 2470 | + uksm_remove_vma(vma); 2471 | + up_read(sem); 2472 | + return -ENOENT; 2473 | + } 2474 | + return 0; 2475 | + } 2476 | + 2477 | + spin_unlock(&vma_slot_list_lock); 2478 | + return -EBUSY; 2479 | +} 2480 | + 2481 | +static inline unsigned long 2482 | +vma_page_address(struct page *page, struct vm_area_struct *vma) 2483 | +{ 2484 | + pgoff_t pgoff = page->index; 2485 | + unsigned long address; 2486 | + 2487 | + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 2488 | + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 2489 | + /* page should be within @vma mapping range */ 2490 | + return -EFAULT; 2491 | + } 2492 | + return address; 2493 | +} 2494 | + 2495 | + 2496 | +/* return 0 on success with the item's mmap_sem locked */ 2497 | +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) 2498 | +{ 2499 | + struct mm_struct *mm; 2500 | + struct vma_slot *slot = item->slot; 2501 | + int err = -EINVAL; 2502 | + 2503 | + struct page *page; 2504 | + 2505 | + /* 2506 | + * try_down_read_slot_mmap_sem() returns non-zero if the slot 2507 | + * has been removed by uksm_remove_vma(). 2508 | + */ 2509 | + if (try_down_read_slot_mmap_sem(slot)) 2510 | + return -EBUSY; 2511 | + 2512 | + mm = slot->vma->vm_mm; 2513 | + 2514 | + if (uksm_test_exit(mm)) 2515 | + goto failout_up; 2516 | + 2517 | + page = item->page; 2518 | + rcu_read_lock(); 2519 | + if (!get_page_unless_zero(page)) { 2520 | + rcu_read_unlock(); 2521 | + goto failout_up; 2522 | + } 2523 | + 2524 | + /* No need to consider huge page here. */ 2525 | + if (item->slot->vma->anon_vma != page_anon_vma(page) || 2526 | + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { 2527 | + /* 2528 | + * TODO: 2529 | + * should we release this item becase of its stale page 2530 | + * mapping? 2531 | + */ 2532 | + put_page(page); 2533 | + rcu_read_unlock(); 2534 | + goto failout_up; 2535 | + } 2536 | + rcu_read_unlock(); 2537 | + return 0; 2538 | + 2539 | +failout_up: 2540 | + up_read(&mm->mmap_sem); 2541 | + return err; 2542 | +} 2543 | + 2544 | +/* 2545 | + * What kind of VMA is considered ? 
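try_down_read_slot_mmap_sem() above never blocks on mmap_sem: a slot already moved to the del list yields -ENOENT, a semaphore that is merely contended yields -EBUSY, and uksmd simply skips the busy VMA and retries on a later pass. A minimal userspace sketch of that trylock-and-skip pattern using a pthread rwlock (struct and function names are illustrative, not the patch's API):

#include <stdio.h>
#include <errno.h>
#include <pthread.h>

struct slot {
    int removed;                  /* stands in for "moved to the del list" */
    pthread_rwlock_t *sem;        /* stands in for the owning mm's mmap_sem */
};

/* 0: read lock taken, -ENOENT: slot gone, -EBUSY: try again next pass */
static int try_lock_slot(struct slot *s)
{
    if (s->removed)
        return -ENOENT;
    if (pthread_rwlock_tryrdlock(s->sem))
        return -EBUSY;
    return 0;
}

int main(void)
{
    pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER;
    struct slot s = { .removed = 0, .sem = &sem };

    if (try_lock_slot(&s) == 0) {        /* free: scan under the lock      */
        printf("locked for scanning\n");
        pthread_rwlock_unlock(&sem);
    }

    s.removed = 1;                       /* VMA went away behind our back  */
    printf("%d\n", try_lock_slot(&s));   /* -ENOENT: slot must be dropped  */
    return 0;
}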
2546 | + */ 2547 | +static inline int vma_can_enter(struct vm_area_struct *vma) 2548 | +{ 2549 | + return uksm_flags_can_scan(vma->vm_flags); 2550 | +} 2551 | + 2552 | +/* 2553 | + * Called whenever a fresh new vma is created A new vma_slot. 2554 | + * is created and inserted into a global list Must be called. 2555 | + * after vma is inserted to its mm . 2556 | + */ 2557 | +void uksm_vma_add_new(struct vm_area_struct *vma) 2558 | +{ 2559 | + struct vma_slot *slot; 2560 | + 2561 | + if (!vma_can_enter(vma)) { 2562 | + vma->uksm_vma_slot = NULL; 2563 | + return; 2564 | + } 2565 | + 2566 | + slot = alloc_vma_slot(); 2567 | + if (!slot) { 2568 | + vma->uksm_vma_slot = NULL; 2569 | + return; 2570 | + } 2571 | + 2572 | + vma->uksm_vma_slot = slot; 2573 | + vma->vm_flags |= VM_MERGEABLE; 2574 | + slot->vma = vma; 2575 | + slot->mm = vma->vm_mm; 2576 | + slot->ctime_j = jiffies; 2577 | + slot->pages = vma_pages(vma); 2578 | + spin_lock(&vma_slot_list_lock); 2579 | + list_add_tail(&slot->slot_list, &vma_slot_new); 2580 | + spin_unlock(&vma_slot_list_lock); 2581 | +} 2582 | + 2583 | +/* 32/3 < they < 32/2 */ 2584 | +#define shiftl 8 2585 | +#define shiftr 12 2586 | + 2587 | +#define HASH_FROM_TO(from, to) \ 2588 | +for (index = from; index < to; index++) { \ 2589 | + pos = random_nums[index]; \ 2590 | + hash += key[pos]; \ 2591 | + hash += (hash << shiftl); \ 2592 | + hash ^= (hash >> shiftr); \ 2593 | +} 2594 | + 2595 | + 2596 | +#define HASH_FROM_DOWN_TO(from, to) \ 2597 | +for (index = from - 1; index >= to; index--) { \ 2598 | + hash ^= (hash >> shiftr); \ 2599 | + hash ^= (hash >> (shiftr*2)); \ 2600 | + hash -= (hash << shiftl); \ 2601 | + hash += (hash << (shiftl*2)); \ 2602 | + pos = random_nums[index]; \ 2603 | + hash -= key[pos]; \ 2604 | +} 2605 | + 2606 | +/* 2607 | + * The main random sample hash function. 
2608 | + */ 2609 | +static u32 random_sample_hash(void *addr, u32 hash_strength) 2610 | +{ 2611 | + u32 hash = 0xdeadbeef; 2612 | + int index, pos, loop = hash_strength; 2613 | + u32 *key = (u32 *)addr; 2614 | + 2615 | + if (loop > HASH_STRENGTH_FULL) 2616 | + loop = HASH_STRENGTH_FULL; 2617 | + 2618 | + HASH_FROM_TO(0, loop); 2619 | + 2620 | + if (hash_strength > HASH_STRENGTH_FULL) { 2621 | + loop = hash_strength - HASH_STRENGTH_FULL; 2622 | + HASH_FROM_TO(0, loop); 2623 | + } 2624 | + 2625 | + return hash; 2626 | +} 2627 | + 2628 | + 2629 | +/** 2630 | + * It's used when hash strength is adjusted 2631 | + * 2632 | + * @addr The page's virtual address 2633 | + * @from The original hash strength 2634 | + * @to The hash strength changed to 2635 | + * @hash The hash value generated with "from" hash value 2636 | + * 2637 | + * return the hash value 2638 | + */ 2639 | +static u32 delta_hash(void *addr, int from, int to, u32 hash) 2640 | +{ 2641 | + u32 *key = (u32 *)addr; 2642 | + int index, pos; /* make sure they are int type */ 2643 | + 2644 | + if (to > from) { 2645 | + if (from >= HASH_STRENGTH_FULL) { 2646 | + from -= HASH_STRENGTH_FULL; 2647 | + to -= HASH_STRENGTH_FULL; 2648 | + HASH_FROM_TO(from, to); 2649 | + } else if (to <= HASH_STRENGTH_FULL) { 2650 | + HASH_FROM_TO(from, to); 2651 | + } else { 2652 | + HASH_FROM_TO(from, HASH_STRENGTH_FULL); 2653 | + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); 2654 | + } 2655 | + } else { 2656 | + if (from <= HASH_STRENGTH_FULL) { 2657 | + HASH_FROM_DOWN_TO(from, to); 2658 | + } else if (to >= HASH_STRENGTH_FULL) { 2659 | + from -= HASH_STRENGTH_FULL; 2660 | + to -= HASH_STRENGTH_FULL; 2661 | + HASH_FROM_DOWN_TO(from, to); 2662 | + } else { 2663 | + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); 2664 | + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); 2665 | + } 2666 | + } 2667 | + 2668 | + return hash; 2669 | +} 2670 | + 2671 | +/** 2672 | + * 2673 | + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round 2674 | + * has finished. 2675 | + * 2676 | + * return 0 if no page has been scanned since last call, 1 otherwise. 
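random_sample_hash() above hashes only hash_strength randomly chosen 32-bit words of a page (the positions come from random_nums), which is what makes the per-page cost adjustable, and delta_hash() can add or remove sample positions so existing hash values are updated incrementally instead of being recomputed from scratch. A standalone userspace version of the same forward mixing loop, with example sample positions:

#include <stdio.h>
#include <stdint.h>

#define shiftl 8
#define shiftr 12

/* sample 'strength' u32 words of 'key' at the given positions and mix them,
 * the HASH_FROM_TO() step: add the word, then shift-add and shift-xor mix */
static uint32_t random_sample_hash(const uint32_t *key,
                                   const unsigned *positions,
                                   unsigned strength)
{
    uint32_t hash = 0xdeadbeef;
    unsigned i;

    for (i = 0; i < strength; i++) {
        hash += key[positions[i]];
        hash += (hash << shiftl);
        hash ^= (hash >> shiftr);
    }
    return hash;
}

int main(void)
{
    uint32_t page[1024] = { 0 };             /* a 4 KiB page as 1024 words */
    unsigned pos[4] = { 7, 130, 512, 1001 }; /* example sample positions   */

    page[130] = 0x12345678;
    printf("%#x\n", random_sample_hash(page, pos, 4));

    page[512] = 1;                           /* touching a sampled word... */
    printf("%#x\n", random_sample_hash(page, pos, 4)); /* ...changes hash  */

    page[3] = 99;                            /* an unsampled word does not */
    printf("%#x\n", random_sample_hash(page, pos, 4));
    return 0;
}

HASH_FROM_DOWN_TO() is the exact inverse of this loop (each add, multiply-by-257 and xor-shift step is invertible mod 2^32), which is what lets delta_hash() lower the strength without rehashing the whole page.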
2677 | + */ 2678 | +static inline int encode_benefit(void) 2679 | +{ 2680 | + u64 scanned_delta, pos_delta, neg_delta; 2681 | + unsigned long base = benefit.base; 2682 | + 2683 | + scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; 2684 | + 2685 | + if (!scanned_delta) 2686 | + return 0; 2687 | + 2688 | + scanned_delta >>= base; 2689 | + pos_delta = rshash_pos >> base; 2690 | + neg_delta = rshash_neg >> base; 2691 | + 2692 | + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || 2693 | + CAN_OVERFLOW_U64(benefit.neg, neg_delta) || 2694 | + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { 2695 | + benefit.scanned >>= 1; 2696 | + benefit.neg >>= 1; 2697 | + benefit.pos >>= 1; 2698 | + benefit.base++; 2699 | + scanned_delta >>= 1; 2700 | + pos_delta >>= 1; 2701 | + neg_delta >>= 1; 2702 | + } 2703 | + 2704 | + benefit.pos += pos_delta; 2705 | + benefit.neg += neg_delta; 2706 | + benefit.scanned += scanned_delta; 2707 | + 2708 | + BUG_ON(!benefit.scanned); 2709 | + 2710 | + rshash_pos = rshash_neg = 0; 2711 | + uksm_pages_scanned_last = uksm_pages_scanned; 2712 | + 2713 | + return 1; 2714 | +} 2715 | + 2716 | +static inline void reset_benefit(void) 2717 | +{ 2718 | + benefit.pos = 0; 2719 | + benefit.neg = 0; 2720 | + benefit.base = 0; 2721 | + benefit.scanned = 0; 2722 | +} 2723 | + 2724 | +static inline void inc_rshash_pos(unsigned long delta) 2725 | +{ 2726 | + if (CAN_OVERFLOW_U64(rshash_pos, delta)) 2727 | + encode_benefit(); 2728 | + 2729 | + rshash_pos += delta; 2730 | +} 2731 | + 2732 | +static inline void inc_rshash_neg(unsigned long delta) 2733 | +{ 2734 | + if (CAN_OVERFLOW_U64(rshash_neg, delta)) 2735 | + encode_benefit(); 2736 | + 2737 | + rshash_neg += delta; 2738 | +} 2739 | + 2740 | + 2741 | +static inline u32 page_hash(struct page *page, unsigned long hash_strength, 2742 | + int cost_accounting) 2743 | +{ 2744 | + u32 val; 2745 | + unsigned long delta; 2746 | + 2747 | + void *addr = kmap_atomic(page); 2748 | + 2749 | + val = random_sample_hash(addr, hash_strength); 2750 | + kunmap_atomic(addr); 2751 | + 2752 | + if (cost_accounting) { 2753 | + if (HASH_STRENGTH_FULL > hash_strength) 2754 | + delta = HASH_STRENGTH_FULL - hash_strength; 2755 | + else 2756 | + delta = 0; 2757 | + 2758 | + inc_rshash_pos(delta); 2759 | + } 2760 | + 2761 | + return val; 2762 | +} 2763 | + 2764 | +static int memcmp_pages(struct page *page1, struct page *page2, 2765 | + int cost_accounting) 2766 | +{ 2767 | + char *addr1, *addr2; 2768 | + int ret; 2769 | + 2770 | + addr1 = kmap_atomic(page1); 2771 | + addr2 = kmap_atomic(page2); 2772 | + ret = memcmp(addr1, addr2, PAGE_SIZE); 2773 | + kunmap_atomic(addr2); 2774 | + kunmap_atomic(addr1); 2775 | + 2776 | + if (cost_accounting) 2777 | + inc_rshash_neg(memcmp_cost); 2778 | + 2779 | + return ret; 2780 | +} 2781 | + 2782 | +static inline int pages_identical(struct page *page1, struct page *page2) 2783 | +{ 2784 | + return !memcmp_pages(page1, page2, 0); 2785 | +} 2786 | + 2787 | +static inline int is_page_full_zero(struct page *page) 2788 | +{ 2789 | + char *addr; 2790 | + int ret; 2791 | + 2792 | + addr = kmap_atomic(page); 2793 | + ret = is_full_zero(addr, PAGE_SIZE); 2794 | + kunmap_atomic(addr); 2795 | + 2796 | + return ret; 2797 | +} 2798 | + 2799 | +static int write_protect_page(struct vm_area_struct *vma, struct page *page, 2800 | + pte_t *orig_pte, pte_t *old_pte) 2801 | +{ 2802 | + struct mm_struct *mm = vma->vm_mm; 2803 | + unsigned long addr; 2804 | + pte_t *ptep; 2805 | + spinlock_t *ptl; 2806 | + int swapped; 2807 | + int err = 
-EFAULT; 2808 | + unsigned long mmun_start; /* For mmu_notifiers */ 2809 | + unsigned long mmun_end; /* For mmu_notifiers */ 2810 | + 2811 | + addr = page_address_in_vma(page, vma); 2812 | + if (addr == -EFAULT) 2813 | + goto out; 2814 | + 2815 | + BUG_ON(PageTransCompound(page)); 2816 | + 2817 | + mmun_start = addr; 2818 | + mmun_end = addr + PAGE_SIZE; 2819 | + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2820 | + 2821 | + ptep = page_check_address(page, mm, addr, &ptl, 0); 2822 | + if (!ptep) 2823 | + goto out_mn; 2824 | + 2825 | + if (old_pte) 2826 | + *old_pte = *ptep; 2827 | + 2828 | + if (pte_write(*ptep) || pte_dirty(*ptep)) { 2829 | + pte_t entry; 2830 | + 2831 | + swapped = PageSwapCache(page); 2832 | + flush_cache_page(vma, addr, page_to_pfn(page)); 2833 | + /* 2834 | + * Ok this is tricky, when get_user_pages_fast() run it doesnt 2835 | + * take any lock, therefore the check that we are going to make 2836 | + * with the pagecount against the mapcount is racey and 2837 | + * O_DIRECT can happen right after the check. 2838 | + * So we clear the pte and flush the tlb before the check 2839 | + * this assure us that no O_DIRECT can happen after the check 2840 | + * or in the middle of the check. 2841 | + */ 2842 | + entry = ptep_clear_flush_notify(vma, addr, ptep); 2843 | + /* 2844 | + * Check that no O_DIRECT or similar I/O is in progress on the 2845 | + * page 2846 | + */ 2847 | + if (page_mapcount(page) + 1 + swapped != page_count(page)) { 2848 | + set_pte_at(mm, addr, ptep, entry); 2849 | + goto out_unlock; 2850 | + } 2851 | + if (pte_dirty(entry)) 2852 | + set_page_dirty(page); 2853 | + entry = pte_mkclean(pte_wrprotect(entry)); 2854 | + set_pte_at_notify(mm, addr, ptep, entry); 2855 | + } 2856 | + *orig_pte = *ptep; 2857 | + err = 0; 2858 | + 2859 | +out_unlock: 2860 | + pte_unmap_unlock(ptep, ptl); 2861 | +out_mn: 2862 | + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2863 | +out: 2864 | + return err; 2865 | +} 2866 | + 2867 | +#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ 2868 | +#define MERGE_ERR_COLLI 2 /* there is a collision */ 2869 | +#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ 2870 | +#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ 2871 | + 2872 | + 2873 | +/** 2874 | + * replace_page - replace page in vma by new ksm page 2875 | + * @vma: vma that holds the pte pointing to page 2876 | + * @page: the page we are replacing by kpage 2877 | + * @kpage: the ksm page we replace page by 2878 | + * @orig_pte: the original value of the pte 2879 | + * 2880 | + * Returns 0 on success, MERGE_ERR_PGERR on failure. 
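The rshash_pos/rshash_neg bookkeeping above never lets its u64 counters wrap: encode_benefit() pre-shifts every delta by benefit.base, and when an addition would still overflow it halves all three accumulators and bumps the base, so only the pos : neg : scanned proportions survive, which is all the hash-strength heuristic needs. A compact userspace sketch of that saturating ratio encoding:

#include <stdio.h>
#include <stdint.h>

#define CAN_OVERFLOW_U64(x, d) (UINT64_MAX - (x) < (d))

/* ratio accumulator in the style of struct uksm_benefit: absolute counts are
 * given up, but pos/neg/scanned keep their relative proportions */
struct benefit {
    uint64_t pos, neg, scanned;
    unsigned base;               /* how many times everything was halved */
};

static void benefit_add(struct benefit *b, uint64_t pos, uint64_t neg,
                        uint64_t scanned)
{
    pos >>= b->base;
    neg >>= b->base;
    scanned >>= b->base;

    if (CAN_OVERFLOW_U64(b->pos, pos) ||
        CAN_OVERFLOW_U64(b->neg, neg) ||
        CAN_OVERFLOW_U64(b->scanned, scanned)) {
        b->pos >>= 1; b->neg >>= 1; b->scanned >>= 1;
        pos >>= 1; neg >>= 1; scanned >>= 1;
        b->base++;
    }

    b->pos += pos;
    b->neg += neg;
    b->scanned += scanned;
}

int main(void)
{
    struct benefit b = { 0, 0, 0, 0 };

    benefit_add(&b, UINT64_MAX - 10, 5, 1000);
    printf("base=%u\n", b.base);              /* 0: still fits            */
    benefit_add(&b, 1000, 5, 1000);
    printf("base=%u\n", b.base);              /* 1: everything was halved */
    return 0;
}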
2881 | + */ 2882 | +static int replace_page(struct vm_area_struct *vma, struct page *page, 2883 | + struct page *kpage, pte_t orig_pte) 2884 | +{ 2885 | + struct mm_struct *mm = vma->vm_mm; 2886 | + pgd_t *pgd; 2887 | + pud_t *pud; 2888 | + pmd_t *pmd; 2889 | + pte_t *ptep; 2890 | + spinlock_t *ptl; 2891 | + pte_t entry; 2892 | + 2893 | + unsigned long addr; 2894 | + int err = MERGE_ERR_PGERR; 2895 | + unsigned long mmun_start; /* For mmu_notifiers */ 2896 | + unsigned long mmun_end; /* For mmu_notifiers */ 2897 | + 2898 | + addr = page_address_in_vma(page, vma); 2899 | + if (addr == -EFAULT) 2900 | + goto out; 2901 | + 2902 | + pgd = pgd_offset(mm, addr); 2903 | + if (!pgd_present(*pgd)) 2904 | + goto out; 2905 | + 2906 | + pud = pud_offset(pgd, addr); 2907 | + if (!pud_present(*pud)) 2908 | + goto out; 2909 | + 2910 | + pmd = pmd_offset(pud, addr); 2911 | + BUG_ON(pmd_trans_huge(*pmd)); 2912 | + if (!pmd_present(*pmd)) 2913 | + goto out; 2914 | + 2915 | + mmun_start = addr; 2916 | + mmun_end = addr + PAGE_SIZE; 2917 | + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2918 | + 2919 | + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 2920 | + if (!pte_same(*ptep, orig_pte)) { 2921 | + pte_unmap_unlock(ptep, ptl); 2922 | + goto out_mn; 2923 | + } 2924 | + 2925 | + flush_cache_page(vma, addr, pte_pfn(*ptep)); 2926 | + ptep_clear_flush_notify(vma, addr, ptep); 2927 | + entry = mk_pte(kpage, vma->vm_page_prot); 2928 | + 2929 | + /* special treatment is needed for zero_page */ 2930 | + if ((page_to_pfn(kpage) == uksm_zero_pfn) || 2931 | + (page_to_pfn(kpage) == zero_pfn)) { 2932 | + entry = pte_mkspecial(entry); 2933 | + dec_mm_counter(mm, MM_ANONPAGES); 2934 | + } else { 2935 | + get_page(kpage); 2936 | + page_add_anon_rmap(kpage, vma, addr, false); 2937 | + } 2938 | + 2939 | + set_pte_at_notify(mm, addr, ptep, entry); 2940 | + 2941 | + page_remove_rmap(page, false); 2942 | + if (!page_mapped(page)) 2943 | + try_to_free_swap(page); 2944 | + put_page(page); 2945 | + 2946 | + pte_unmap_unlock(ptep, ptl); 2947 | + err = 0; 2948 | +out_mn: 2949 | + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2950 | +out: 2951 | + return err; 2952 | +} 2953 | + 2954 | + 2955 | +/** 2956 | + * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The 2957 | + * zero hash value at HASH_STRENGTH_MAX is used to indicated that its 2958 | + * hash_max member has not been calculated. 2959 | + * 2960 | + * @page The page needs to be hashed 2961 | + * @hash_old The hash value calculated with current hash strength 2962 | + * 2963 | + * return the new hash value calculated at HASH_STRENGTH_MAX 2964 | + */ 2965 | +static inline u32 page_hash_max(struct page *page, u32 hash_old) 2966 | +{ 2967 | + u32 hash_max = 0; 2968 | + void *addr; 2969 | + 2970 | + addr = kmap_atomic(page); 2971 | + hash_max = delta_hash(addr, hash_strength, 2972 | + HASH_STRENGTH_MAX, hash_old); 2973 | + 2974 | + kunmap_atomic(addr); 2975 | + 2976 | + if (!hash_max) 2977 | + hash_max = 1; 2978 | + 2979 | + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); 2980 | + return hash_max; 2981 | +} 2982 | + 2983 | +/* 2984 | + * We compare the hash again, to ensure that it is really a hash collision 2985 | + * instead of being caused by page write. 
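check_collision() below resolves the ambiguity described in the comment just above: after a failed memcmp the page is hashed again, and an unchanged hash means a genuine sampling collision (the sampled words really are equal) while a changed hash means the page was written between hashing and comparison. A toy userspace illustration of that decision; the partial_hash() stand-in below is deliberately simplistic and is not the patch's random_sample_hash():

#include <stdio.h>
#include <stdint.h>

#define ERR_COLLI   2   /* same content at the sampled words: true collision */
#define ERR_CHANGED 4   /* page was modified after it was hashed             */

/* stand-in for a partial-strength hash: only looks at a few sampled words,
 * so different pages can legitimately hash alike */
static uint32_t partial_hash(const uint32_t *page)
{
    return page[1] + page[200] + page[900];
}

/* the check_collision() idea: rehash and compare with the hash recorded
 * when the page entered the tree */
static int classify_memcmp_failure(const uint32_t *page, uint32_t recorded)
{
    return partial_hash(page) == recorded ? ERR_COLLI : ERR_CHANGED;
}

int main(void)
{
    uint32_t page[1024] = { 0 };
    uint32_t hash = partial_hash(page);

    page[500] = 7;   /* differs from its tree partner, but not at a sample */
    printf("%d\n", classify_memcmp_failure(page, hash));  /* 2: collision  */

    page[200] = 9;   /* a sampled word changed after hashing */
    printf("%d\n", classify_memcmp_failure(page, hash));  /* 4: changed    */
    return 0;
}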
2986 | + */ 2987 | +static inline int check_collision(struct rmap_item *rmap_item, 2988 | + u32 hash) 2989 | +{ 2990 | + int err; 2991 | + struct page *page = rmap_item->page; 2992 | + 2993 | + /* if this rmap_item has already been hash_maxed, then the collision 2994 | + * must appears in the second-level rbtree search. In this case we check 2995 | + * if its hash_max value has been changed. Otherwise, the collision 2996 | + * happens in the first-level rbtree search, so we check against it's 2997 | + * current hash value. 2998 | + */ 2999 | + if (rmap_item->hash_max) { 3000 | + inc_rshash_neg(memcmp_cost); 3001 | + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); 3002 | + 3003 | + if (rmap_item->hash_max == page_hash_max(page, hash)) 3004 | + err = MERGE_ERR_COLLI; 3005 | + else 3006 | + err = MERGE_ERR_CHANGED; 3007 | + } else { 3008 | + inc_rshash_neg(memcmp_cost + hash_strength); 3009 | + 3010 | + if (page_hash(page, hash_strength, 0) == hash) 3011 | + err = MERGE_ERR_COLLI; 3012 | + else 3013 | + err = MERGE_ERR_CHANGED; 3014 | + } 3015 | + 3016 | + return err; 3017 | +} 3018 | + 3019 | +/** 3020 | + * Try to merge a rmap_item.page with a kpage in stable node. kpage must 3021 | + * already be a ksm page. 3022 | + * 3023 | + * @return 0 if the pages were merged, -EFAULT otherwise. 3024 | + */ 3025 | +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, 3026 | + struct page *kpage, u32 hash) 3027 | +{ 3028 | + struct vm_area_struct *vma = rmap_item->slot->vma; 3029 | + struct mm_struct *mm = vma->vm_mm; 3030 | + pte_t orig_pte = __pte(0); 3031 | + int err = MERGE_ERR_PGERR; 3032 | + struct page *page; 3033 | + 3034 | + if (uksm_test_exit(mm)) 3035 | + goto out; 3036 | + 3037 | + page = rmap_item->page; 3038 | + 3039 | + if (page == kpage) { /* ksm page forked */ 3040 | + err = 0; 3041 | + goto out; 3042 | + } 3043 | + 3044 | + /* 3045 | + * We need the page lock to read a stable PageSwapCache in 3046 | + * write_protect_page(). We use trylock_page() instead of 3047 | + * lock_page() because we don't want to wait here - we 3048 | + * prefer to continue scanning and merging different pages, 3049 | + * then come back to this page when it is unlocked. 3050 | + */ 3051 | + if (!trylock_page(page)) 3052 | + goto out; 3053 | + 3054 | + if (!PageAnon(page) || !PageKsm(kpage)) 3055 | + goto out_unlock; 3056 | + 3057 | + if (PageTransCompound(page)) { 3058 | + err = split_huge_page(page); 3059 | + if (err) 3060 | + goto out_unlock; 3061 | + } 3062 | + 3063 | + /* 3064 | + * If this anonymous page is mapped only here, its pte may need 3065 | + * to be write-protected. If it's mapped elsewhere, all of its 3066 | + * ptes are necessarily already write-protected. But in either 3067 | + * case, we need to lock and check page_count is not raised. 
3068 | + */ 3069 | + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { 3070 | + if (pages_identical(page, kpage)) 3071 | + err = replace_page(vma, page, kpage, orig_pte); 3072 | + else 3073 | + err = check_collision(rmap_item, hash); 3074 | + } 3075 | + 3076 | + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { 3077 | + munlock_vma_page(page); 3078 | + if (!PageMlocked(kpage)) { 3079 | + unlock_page(page); 3080 | + lock_page(kpage); 3081 | + mlock_vma_page(kpage); 3082 | + page = kpage; /* for final unlock */ 3083 | + } 3084 | + } 3085 | + 3086 | +out_unlock: 3087 | + unlock_page(page); 3088 | +out: 3089 | + return err; 3090 | +} 3091 | + 3092 | + 3093 | + 3094 | +/** 3095 | + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance 3096 | + * to restore a page mapping that has been changed in try_to_merge_two_pages. 3097 | + * 3098 | + * @return 0 on success. 3099 | + */ 3100 | +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, 3101 | + pte_t orig_pte, pte_t wprt_pte) 3102 | +{ 3103 | + struct mm_struct *mm = vma->vm_mm; 3104 | + pgd_t *pgd; 3105 | + pud_t *pud; 3106 | + pmd_t *pmd; 3107 | + pte_t *ptep; 3108 | + spinlock_t *ptl; 3109 | + 3110 | + int err = -EFAULT; 3111 | + 3112 | + pgd = pgd_offset(mm, addr); 3113 | + if (!pgd_present(*pgd)) 3114 | + goto out; 3115 | + 3116 | + pud = pud_offset(pgd, addr); 3117 | + if (!pud_present(*pud)) 3118 | + goto out; 3119 | + 3120 | + pmd = pmd_offset(pud, addr); 3121 | + if (!pmd_present(*pmd)) 3122 | + goto out; 3123 | + 3124 | + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 3125 | + if (!pte_same(*ptep, wprt_pte)) { 3126 | + /* already copied, let it be */ 3127 | + pte_unmap_unlock(ptep, ptl); 3128 | + goto out; 3129 | + } 3130 | + 3131 | + /* 3132 | + * Good boy, still here. When we still get the ksm page, it does not 3133 | + * return to the free page pool, there is no way that a pte was changed 3134 | + * to other page and gets back to this page. And remind that ksm page 3135 | + * do not reuse in do_wp_page(). So it's safe to restore the original 3136 | + * pte. 3137 | + */ 3138 | + flush_cache_page(vma, addr, pte_pfn(*ptep)); 3139 | + ptep_clear_flush_notify(vma, addr, ptep); 3140 | + set_pte_at_notify(mm, addr, ptep, orig_pte); 3141 | + 3142 | + pte_unmap_unlock(ptep, ptl); 3143 | + err = 0; 3144 | +out: 3145 | + return err; 3146 | +} 3147 | + 3148 | +/** 3149 | + * try_to_merge_two_pages() - take two identical pages and prepare 3150 | + * them to be merged into one page(rmap_item->page) 3151 | + * 3152 | + * @return 0 if we successfully merged two identical pages into 3153 | + * one ksm page. MERGE_ERR_COLLI if it's only a hash collision 3154 | + * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been 3155 | + * changed since it's hashed. MERGE_ERR_PGERR otherwise. 
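try_to_merge_two_pages() below behaves like a small transaction: both pages are write-protected first and their old PTEs saved, the contents are compared only afterwards, and restore_uksm_page_pte() above rolls the first mapping back when anything fails, so the failure path needs no break_ksm-style cleanup. A heavily simplified userspace sketch of that protect / compare / commit-or-rollback shape, with plain flags and byte arrays standing in for PTEs and pages:

#include <stdio.h>
#include <string.h>

struct mapping { char page[8]; int writable; };

/* "write_protect_page": remember the old state, then drop write access */
static void protect(struct mapping *m, int *saved_writable)
{
    *saved_writable = m->writable;
    m->writable = 0;
}

/* "restore_uksm_page_pte": roll the mapping back to what it was */
static void restore(struct mapping *m, int saved_writable)
{
    m->writable = saved_writable;
}

/* the try_to_merge_two_pages() shape: protect both, compare, then either
 * commit (point b at a's content) or roll back */
static int try_merge(struct mapping *a, struct mapping *b)
{
    int wa, wb;

    protect(a, &wa);
    protect(b, &wb);

    if (memcmp(a->page, b->page, sizeof(a->page)) == 0) {
        memcpy(b->page, a->page, sizeof(a->page));  /* "replace_page" */
        return 0;
    }

    restore(b, wb);   /* failed: undo, no break_ksm-style cleanup needed */
    restore(a, wa);
    return -1;
}

int main(void)
{
    struct mapping a = { "same", 1 }, b = { "same", 1 }, c = { "diff", 1 };

    printf("merge a,b: %d (b writable=%d)\n", try_merge(&a, &b), b.writable);
    printf("merge a,c: %d (c writable=%d)\n", try_merge(&a, &c), c.writable);
    return 0;
}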
3156 | + * 3157 | + */ 3158 | +static int try_to_merge_two_pages(struct rmap_item *rmap_item, 3159 | + struct rmap_item *tree_rmap_item, 3160 | + u32 hash) 3161 | +{ 3162 | + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); 3163 | + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); 3164 | + struct vm_area_struct *vma1 = rmap_item->slot->vma; 3165 | + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; 3166 | + struct page *page = rmap_item->page; 3167 | + struct page *tree_page = tree_rmap_item->page; 3168 | + int err = MERGE_ERR_PGERR; 3169 | + struct address_space *saved_mapping; 3170 | + 3171 | + 3172 | + if (rmap_item->page == tree_rmap_item->page) 3173 | + goto out; 3174 | + 3175 | + if (!trylock_page(page)) 3176 | + goto out; 3177 | + 3178 | + if (!PageAnon(page)) 3179 | + goto out_unlock; 3180 | + 3181 | + if (PageTransCompound(page)) { 3182 | + err = split_huge_page(page); 3183 | + if (err) 3184 | + goto out_unlock; 3185 | + } 3186 | + 3187 | + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { 3188 | + unlock_page(page); 3189 | + goto out; 3190 | + } 3191 | + 3192 | + /* 3193 | + * While we hold page lock, upgrade page from 3194 | + * PageAnon+anon_vma to PageKsm+NULL stable_node: 3195 | + * stable_tree_insert() will update stable_node. 3196 | + */ 3197 | + saved_mapping = page->mapping; 3198 | + set_page_stable_node(page, NULL); 3199 | + mark_page_accessed(page); 3200 | + if (!PageDirty(page)) 3201 | + SetPageDirty(page); 3202 | + 3203 | + unlock_page(page); 3204 | + 3205 | + if (!trylock_page(tree_page)) 3206 | + goto restore_out; 3207 | + 3208 | + if (!PageAnon(tree_page)) { 3209 | + unlock_page(tree_page); 3210 | + goto restore_out; 3211 | + } 3212 | + 3213 | + if (PageTransCompound(tree_page)) { 3214 | + err = split_huge_page(tree_page); 3215 | + if (err) { 3216 | + unlock_page(tree_page); 3217 | + goto restore_out; 3218 | + } 3219 | + } 3220 | + 3221 | + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { 3222 | + unlock_page(tree_page); 3223 | + goto restore_out; 3224 | + } 3225 | + 3226 | + if (pages_identical(page, tree_page)) { 3227 | + err = replace_page(vma2, tree_page, page, wprt_pte2); 3228 | + if (err) { 3229 | + unlock_page(tree_page); 3230 | + goto restore_out; 3231 | + } 3232 | + 3233 | + if ((vma2->vm_flags & VM_LOCKED)) { 3234 | + munlock_vma_page(tree_page); 3235 | + if (!PageMlocked(page)) { 3236 | + unlock_page(tree_page); 3237 | + lock_page(page); 3238 | + mlock_vma_page(page); 3239 | + tree_page = page; /* for final unlock */ 3240 | + } 3241 | + } 3242 | + 3243 | + unlock_page(tree_page); 3244 | + 3245 | + goto out; /* success */ 3246 | + 3247 | + } else { 3248 | + if (tree_rmap_item->hash_max && 3249 | + tree_rmap_item->hash_max == rmap_item->hash_max) { 3250 | + err = MERGE_ERR_COLLI_MAX; 3251 | + } else if (page_hash(page, hash_strength, 0) == 3252 | + page_hash(tree_page, hash_strength, 0)) { 3253 | + inc_rshash_neg(memcmp_cost + hash_strength * 2); 3254 | + err = MERGE_ERR_COLLI; 3255 | + } else { 3256 | + err = MERGE_ERR_CHANGED; 3257 | + } 3258 | + 3259 | + unlock_page(tree_page); 3260 | + } 3261 | + 3262 | +restore_out: 3263 | + lock_page(page); 3264 | + if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), 3265 | + orig_pte1, wprt_pte1)) 3266 | + page->mapping = saved_mapping; 3267 | + 3268 | +out_unlock: 3269 | + unlock_page(page); 3270 | +out: 3271 | + return err; 3272 | +} 3273 | + 3274 | +static inline int hash_cmp(u32 new_val, u32 node_val) 3275 | +{ 3276 | + if (new_val > node_val) 3277 | + 
return 1; 3278 | + else if (new_val < node_val) 3279 | + return -1; 3280 | + else 3281 | + return 0; 3282 | +} 3283 | + 3284 | +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) 3285 | +{ 3286 | + u32 hash_max = item->hash_max; 3287 | + 3288 | + if (!hash_max) { 3289 | + hash_max = page_hash_max(item->page, hash); 3290 | + 3291 | + item->hash_max = hash_max; 3292 | + } 3293 | + 3294 | + return hash_max; 3295 | +} 3296 | + 3297 | + 3298 | + 3299 | +/** 3300 | + * stable_tree_search() - search the stable tree for a page 3301 | + * 3302 | + * @item: the rmap_item we are comparing with 3303 | + * @hash: the hash value of this item->page already calculated 3304 | + * 3305 | + * @return the page we have found, NULL otherwise. The page returned has 3306 | + * been gotten. 3307 | + */ 3308 | +static struct page *stable_tree_search(struct rmap_item *item, u32 hash) 3309 | +{ 3310 | + struct rb_node *node = root_stable_treep->rb_node; 3311 | + struct tree_node *tree_node; 3312 | + unsigned long hash_max; 3313 | + struct page *page = item->page; 3314 | + struct stable_node *stable_node; 3315 | + 3316 | + stable_node = page_stable_node(page); 3317 | + if (stable_node) { 3318 | + /* ksm page forked, that is 3319 | + * if (PageKsm(page) && !in_stable_tree(rmap_item)) 3320 | + * it's actually gotten once outside. 3321 | + */ 3322 | + get_page(page); 3323 | + return page; 3324 | + } 3325 | + 3326 | + while (node) { 3327 | + int cmp; 3328 | + 3329 | + tree_node = rb_entry(node, struct tree_node, node); 3330 | + 3331 | + cmp = hash_cmp(hash, tree_node->hash); 3332 | + 3333 | + if (cmp < 0) 3334 | + node = node->rb_left; 3335 | + else if (cmp > 0) 3336 | + node = node->rb_right; 3337 | + else 3338 | + break; 3339 | + } 3340 | + 3341 | + if (!node) 3342 | + return NULL; 3343 | + 3344 | + if (tree_node->count == 1) { 3345 | + stable_node = rb_entry(tree_node->sub_root.rb_node, 3346 | + struct stable_node, node); 3347 | + BUG_ON(!stable_node); 3348 | + 3349 | + goto get_page_out; 3350 | + } 3351 | + 3352 | + /* 3353 | + * ok, we have to search the second 3354 | + * level subtree, hash the page to a 3355 | + * full strength. 3356 | + */ 3357 | + node = tree_node->sub_root.rb_node; 3358 | + BUG_ON(!node); 3359 | + hash_max = rmap_item_hash_max(item, hash); 3360 | + 3361 | + while (node) { 3362 | + int cmp; 3363 | + 3364 | + stable_node = rb_entry(node, struct stable_node, node); 3365 | + 3366 | + cmp = hash_cmp(hash_max, stable_node->hash_max); 3367 | + 3368 | + if (cmp < 0) 3369 | + node = node->rb_left; 3370 | + else if (cmp > 0) 3371 | + node = node->rb_right; 3372 | + else 3373 | + goto get_page_out; 3374 | + } 3375 | + 3376 | + return NULL; 3377 | + 3378 | +get_page_out: 3379 | + page = get_uksm_page(stable_node, 1, 1); 3380 | + return page; 3381 | +} 3382 | + 3383 | +static int try_merge_rmap_item(struct rmap_item *item, 3384 | + struct page *kpage, 3385 | + struct page *tree_page) 3386 | +{ 3387 | + spinlock_t *ptl; 3388 | + pte_t *ptep; 3389 | + unsigned long addr; 3390 | + struct vm_area_struct *vma = item->slot->vma; 3391 | + 3392 | + addr = get_rmap_addr(item); 3393 | + ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); 3394 | + if (!ptep) 3395 | + return 0; 3396 | + 3397 | + if (pte_write(*ptep)) { 3398 | + /* has changed, abort! 
*/ 3399 | + pte_unmap_unlock(ptep, ptl); 3400 | + return 0; 3401 | + } 3402 | + 3403 | + get_page(tree_page); 3404 | + page_add_anon_rmap(tree_page, vma, addr, false); 3405 | + 3406 | + flush_cache_page(vma, addr, pte_pfn(*ptep)); 3407 | + ptep_clear_flush_notify(vma, addr, ptep); 3408 | + set_pte_at_notify(vma->vm_mm, addr, ptep, 3409 | + mk_pte(tree_page, vma->vm_page_prot)); 3410 | + 3411 | + page_remove_rmap(kpage, false); 3412 | + put_page(kpage); 3413 | + 3414 | + pte_unmap_unlock(ptep, ptl); 3415 | + 3416 | + return 1; 3417 | +} 3418 | + 3419 | +/** 3420 | + * try_to_merge_with_stable_page() - when two rmap_items need to be inserted 3421 | + * into stable tree, the page was found to be identical to a stable ksm page, 3422 | + * this is the last chance we can merge them into one. 3423 | + * 3424 | + * @item1: the rmap_item holding the page which we wanted to insert 3425 | + * into stable tree. 3426 | + * @item2: the other rmap_item we found when unstable tree search 3427 | + * @oldpage: the page currently mapped by the two rmap_items 3428 | + * @tree_page: the page we found identical in stable tree node 3429 | + * @success1: return if item1 is successfully merged 3430 | + * @success2: return if item2 is successfully merged 3431 | + */ 3432 | +static void try_merge_with_stable(struct rmap_item *item1, 3433 | + struct rmap_item *item2, 3434 | + struct page **kpage, 3435 | + struct page *tree_page, 3436 | + int *success1, int *success2) 3437 | +{ 3438 | + struct vm_area_struct *vma1 = item1->slot->vma; 3439 | + struct vm_area_struct *vma2 = item2->slot->vma; 3440 | + *success1 = 0; 3441 | + *success2 = 0; 3442 | + 3443 | + if (unlikely(*kpage == tree_page)) { 3444 | + /* I don't think this can really happen */ 3445 | + printk(KERN_WARNING "UKSM: unexpected condition detected in " 3446 | + "try_merge_with_stable() -- *kpage == tree_page !\n"); 3447 | + *success1 = 1; 3448 | + *success2 = 1; 3449 | + return; 3450 | + } 3451 | + 3452 | + if (!PageAnon(*kpage) || !PageKsm(*kpage)) 3453 | + goto failed; 3454 | + 3455 | + if (!trylock_page(tree_page)) 3456 | + goto failed; 3457 | + 3458 | + /* If the oldpage is still ksm and still pointed 3459 | + * to in the right place, and still write protected, 3460 | + * we are confident it's not changed, no need to 3461 | + * memcmp anymore. 3462 | + * be ware, we cannot take nested pte locks, 3463 | + * deadlock risk. 3464 | + */ 3465 | + if (!try_merge_rmap_item(item1, *kpage, tree_page)) 3466 | + goto unlock_failed; 3467 | + 3468 | + /* ok, then vma2, remind that pte1 already set */ 3469 | + if (!try_merge_rmap_item(item2, *kpage, tree_page)) 3470 | + goto success_1; 3471 | + 3472 | + *success2 = 1; 3473 | +success_1: 3474 | + *success1 = 1; 3475 | + 3476 | + 3477 | + if ((*success1 && vma1->vm_flags & VM_LOCKED) || 3478 | + (*success2 && vma2->vm_flags & VM_LOCKED)) { 3479 | + munlock_vma_page(*kpage); 3480 | + if (!PageMlocked(tree_page)) 3481 | + mlock_vma_page(tree_page); 3482 | + } 3483 | + 3484 | + /* 3485 | + * We do not need oldpage any more in the caller, so can break the lock 3486 | + * now. 3487 | + */ 3488 | + unlock_page(*kpage); 3489 | + *kpage = tree_page; /* Get unlocked outside. 
*/ 3490 | + return; 3491 | + 3492 | +unlock_failed: 3493 | + unlock_page(tree_page); 3494 | +failed: 3495 | + return; 3496 | +} 3497 | + 3498 | +static inline void stable_node_hash_max(struct stable_node *node, 3499 | + struct page *page, u32 hash) 3500 | +{ 3501 | + u32 hash_max = node->hash_max; 3502 | + 3503 | + if (!hash_max) { 3504 | + hash_max = page_hash_max(page, hash); 3505 | + node->hash_max = hash_max; 3506 | + } 3507 | +} 3508 | + 3509 | +static inline 3510 | +struct stable_node *new_stable_node(struct tree_node *tree_node, 3511 | + struct page *kpage, u32 hash_max) 3512 | +{ 3513 | + struct stable_node *new_stable_node; 3514 | + 3515 | + new_stable_node = alloc_stable_node(); 3516 | + if (!new_stable_node) 3517 | + return NULL; 3518 | + 3519 | + new_stable_node->kpfn = page_to_pfn(kpage); 3520 | + new_stable_node->hash_max = hash_max; 3521 | + new_stable_node->tree_node = tree_node; 3522 | + set_page_stable_node(kpage, new_stable_node); 3523 | + 3524 | + return new_stable_node; 3525 | +} 3526 | + 3527 | +static inline 3528 | +struct stable_node *first_level_insert(struct tree_node *tree_node, 3529 | + struct rmap_item *rmap_item, 3530 | + struct rmap_item *tree_rmap_item, 3531 | + struct page **kpage, u32 hash, 3532 | + int *success1, int *success2) 3533 | +{ 3534 | + int cmp; 3535 | + struct page *tree_page; 3536 | + u32 hash_max = 0; 3537 | + struct stable_node *stable_node, *new_snode; 3538 | + struct rb_node *parent = NULL, **new; 3539 | + 3540 | + /* this tree node contains no sub-tree yet */ 3541 | + stable_node = rb_entry(tree_node->sub_root.rb_node, 3542 | + struct stable_node, node); 3543 | + 3544 | + tree_page = get_uksm_page(stable_node, 1, 0); 3545 | + if (tree_page) { 3546 | + cmp = memcmp_pages(*kpage, tree_page, 1); 3547 | + if (!cmp) { 3548 | + try_merge_with_stable(rmap_item, tree_rmap_item, kpage, 3549 | + tree_page, success1, success2); 3550 | + put_page(tree_page); 3551 | + if (!*success1 && !*success2) 3552 | + goto failed; 3553 | + 3554 | + return stable_node; 3555 | + 3556 | + } else { 3557 | + /* 3558 | + * collision in first level try to create a subtree. 3559 | + * A new node need to be created. 3560 | + */ 3561 | + put_page(tree_page); 3562 | + 3563 | + stable_node_hash_max(stable_node, tree_page, 3564 | + tree_node->hash); 3565 | + hash_max = rmap_item_hash_max(rmap_item, hash); 3566 | + cmp = hash_cmp(hash_max, stable_node->hash_max); 3567 | + 3568 | + parent = &stable_node->node; 3569 | + if (cmp < 0) { 3570 | + new = &parent->rb_left; 3571 | + } else if (cmp > 0) { 3572 | + new = &parent->rb_right; 3573 | + } else { 3574 | + goto failed; 3575 | + } 3576 | + } 3577 | + 3578 | + } else { 3579 | + /* the only stable_node deleted, we reuse its tree_node. 
3580 | + */ 3581 | + parent = NULL; 3582 | + new = &tree_node->sub_root.rb_node; 3583 | + } 3584 | + 3585 | + new_snode = new_stable_node(tree_node, *kpage, hash_max); 3586 | + if (!new_snode) 3587 | + goto failed; 3588 | + 3589 | + rb_link_node(&new_snode->node, parent, new); 3590 | + rb_insert_color(&new_snode->node, &tree_node->sub_root); 3591 | + tree_node->count++; 3592 | + *success1 = *success2 = 1; 3593 | + 3594 | + return new_snode; 3595 | + 3596 | +failed: 3597 | + return NULL; 3598 | +} 3599 | + 3600 | +static inline 3601 | +struct stable_node *stable_subtree_insert(struct tree_node *tree_node, 3602 | + struct rmap_item *rmap_item, 3603 | + struct rmap_item *tree_rmap_item, 3604 | + struct page **kpage, u32 hash, 3605 | + int *success1, int *success2) 3606 | +{ 3607 | + struct page *tree_page; 3608 | + u32 hash_max; 3609 | + struct stable_node *stable_node, *new_snode; 3610 | + struct rb_node *parent, **new; 3611 | + 3612 | +research: 3613 | + parent = NULL; 3614 | + new = &tree_node->sub_root.rb_node; 3615 | + BUG_ON(!*new); 3616 | + hash_max = rmap_item_hash_max(rmap_item, hash); 3617 | + while (*new) { 3618 | + int cmp; 3619 | + 3620 | + stable_node = rb_entry(*new, struct stable_node, node); 3621 | + 3622 | + cmp = hash_cmp(hash_max, stable_node->hash_max); 3623 | + 3624 | + if (cmp < 0) { 3625 | + parent = *new; 3626 | + new = &parent->rb_left; 3627 | + } else if (cmp > 0) { 3628 | + parent = *new; 3629 | + new = &parent->rb_right; 3630 | + } else { 3631 | + tree_page = get_uksm_page(stable_node, 1, 0); 3632 | + if (tree_page) { 3633 | + cmp = memcmp_pages(*kpage, tree_page, 1); 3634 | + if (!cmp) { 3635 | + try_merge_with_stable(rmap_item, 3636 | + tree_rmap_item, kpage, 3637 | + tree_page, success1, success2); 3638 | + 3639 | + put_page(tree_page); 3640 | + if (!*success1 && !*success2) 3641 | + goto failed; 3642 | + /* 3643 | + * successfully merged with a stable 3644 | + * node 3645 | + */ 3646 | + return stable_node; 3647 | + } else { 3648 | + put_page(tree_page); 3649 | + goto failed; 3650 | + } 3651 | + } else { 3652 | + /* 3653 | + * stable node may be deleted, 3654 | + * and subtree maybe 3655 | + * restructed, cannot 3656 | + * continue, research it. 3657 | + */ 3658 | + if (tree_node->count) { 3659 | + goto research; 3660 | + } else { 3661 | + /* reuse the tree node*/ 3662 | + parent = NULL; 3663 | + new = &tree_node->sub_root.rb_node; 3664 | + } 3665 | + } 3666 | + } 3667 | + } 3668 | + 3669 | + new_snode = new_stable_node(tree_node, *kpage, hash_max); 3670 | + if (!new_snode) 3671 | + goto failed; 3672 | + 3673 | + rb_link_node(&new_snode->node, parent, new); 3674 | + rb_insert_color(&new_snode->node, &tree_node->sub_root); 3675 | + tree_node->count++; 3676 | + *success1 = *success2 = 1; 3677 | + 3678 | + return new_snode; 3679 | + 3680 | +failed: 3681 | + return NULL; 3682 | +} 3683 | + 3684 | + 3685 | +/** 3686 | + * stable_tree_insert() - try to insert a merged page in unstable tree to 3687 | + * the stable tree 3688 | + * 3689 | + * @kpage: the page need to be inserted 3690 | + * @hash: the current hash of this page 3691 | + * @rmap_item: the rmap_item being scanned 3692 | + * @tree_rmap_item: the rmap_item found on unstable tree 3693 | + * @success1: return if rmap_item is merged 3694 | + * @success2: return if tree_rmap_item is merged 3695 | + * 3696 | + * @return the stable_node on stable tree if at least one 3697 | + * rmap_item is inserted into stable tree, NULL 3698 | + * otherwise. 
3699 | + */ 3700 | +static struct stable_node * 3701 | +stable_tree_insert(struct page **kpage, u32 hash, 3702 | + struct rmap_item *rmap_item, 3703 | + struct rmap_item *tree_rmap_item, 3704 | + int *success1, int *success2) 3705 | +{ 3706 | + struct rb_node **new = &root_stable_treep->rb_node; 3707 | + struct rb_node *parent = NULL; 3708 | + struct stable_node *stable_node; 3709 | + struct tree_node *tree_node; 3710 | + u32 hash_max = 0; 3711 | + 3712 | + *success1 = *success2 = 0; 3713 | + 3714 | + while (*new) { 3715 | + int cmp; 3716 | + 3717 | + tree_node = rb_entry(*new, struct tree_node, node); 3718 | + 3719 | + cmp = hash_cmp(hash, tree_node->hash); 3720 | + 3721 | + if (cmp < 0) { 3722 | + parent = *new; 3723 | + new = &parent->rb_left; 3724 | + } else if (cmp > 0) { 3725 | + parent = *new; 3726 | + new = &parent->rb_right; 3727 | + } else 3728 | + break; 3729 | + } 3730 | + 3731 | + if (*new) { 3732 | + if (tree_node->count == 1) { 3733 | + stable_node = first_level_insert(tree_node, rmap_item, 3734 | + tree_rmap_item, kpage, 3735 | + hash, success1, success2); 3736 | + } else { 3737 | + stable_node = stable_subtree_insert(tree_node, 3738 | + rmap_item, tree_rmap_item, kpage, 3739 | + hash, success1, success2); 3740 | + } 3741 | + } else { 3742 | + 3743 | + /* no tree node found */ 3744 | + tree_node = alloc_tree_node(stable_tree_node_listp); 3745 | + if (!tree_node) { 3746 | + stable_node = NULL; 3747 | + goto out; 3748 | + } 3749 | + 3750 | + stable_node = new_stable_node(tree_node, *kpage, hash_max); 3751 | + if (!stable_node) { 3752 | + free_tree_node(tree_node); 3753 | + goto out; 3754 | + } 3755 | + 3756 | + tree_node->hash = hash; 3757 | + rb_link_node(&tree_node->node, parent, new); 3758 | + rb_insert_color(&tree_node->node, root_stable_treep); 3759 | + parent = NULL; 3760 | + new = &tree_node->sub_root.rb_node; 3761 | + 3762 | + rb_link_node(&stable_node->node, parent, new); 3763 | + rb_insert_color(&stable_node->node, &tree_node->sub_root); 3764 | + tree_node->count++; 3765 | + *success1 = *success2 = 1; 3766 | + } 3767 | + 3768 | +out: 3769 | + return stable_node; 3770 | +} 3771 | + 3772 | + 3773 | +/** 3774 | + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem 3775 | + * 3776 | + * @return 0 on success, -EBUSY if unable to lock the mmap_sem, 3777 | + * -EINVAL if the page mapping has been changed. 3778 | + */ 3779 | +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) 3780 | +{ 3781 | + int err; 3782 | + 3783 | + err = get_mergeable_page_lock_mmap(tree_rmap_item); 3784 | + 3785 | + if (err == -EINVAL) { 3786 | + /* its page map has been changed, remove it */ 3787 | + remove_rmap_item_from_tree(tree_rmap_item); 3788 | + } 3789 | + 3790 | + /* The page is gotten and mmap_sem is locked now. */ 3791 | + return err; 3792 | +} 3793 | + 3794 | + 3795 | +/** 3796 | + * unstable_tree_search_insert() - search an unstable tree rmap_item with the 3797 | + * same hash value. 
Get its page and trylock the mmap_sem 3798 | + */ 3799 | +static inline 3800 | +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, 3801 | + u32 hash) 3802 | + 3803 | +{ 3804 | + struct rb_node **new = &root_unstable_tree.rb_node; 3805 | + struct rb_node *parent = NULL; 3806 | + struct tree_node *tree_node; 3807 | + u32 hash_max; 3808 | + struct rmap_item *tree_rmap_item; 3809 | + 3810 | + while (*new) { 3811 | + int cmp; 3812 | + 3813 | + tree_node = rb_entry(*new, struct tree_node, node); 3814 | + 3815 | + cmp = hash_cmp(hash, tree_node->hash); 3816 | + 3817 | + if (cmp < 0) { 3818 | + parent = *new; 3819 | + new = &parent->rb_left; 3820 | + } else if (cmp > 0) { 3821 | + parent = *new; 3822 | + new = &parent->rb_right; 3823 | + } else 3824 | + break; 3825 | + } 3826 | + 3827 | + if (*new) { 3828 | + /* got the tree_node */ 3829 | + if (tree_node->count == 1) { 3830 | + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, 3831 | + struct rmap_item, node); 3832 | + BUG_ON(!tree_rmap_item); 3833 | + 3834 | + goto get_page_out; 3835 | + } 3836 | + 3837 | + /* well, search the collision subtree */ 3838 | + new = &tree_node->sub_root.rb_node; 3839 | + BUG_ON(!*new); 3840 | + hash_max = rmap_item_hash_max(rmap_item, hash); 3841 | + 3842 | + while (*new) { 3843 | + int cmp; 3844 | + 3845 | + tree_rmap_item = rb_entry(*new, struct rmap_item, 3846 | + node); 3847 | + 3848 | + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); 3849 | + parent = *new; 3850 | + if (cmp < 0) 3851 | + new = &parent->rb_left; 3852 | + else if (cmp > 0) 3853 | + new = &parent->rb_right; 3854 | + else 3855 | + goto get_page_out; 3856 | + } 3857 | + } else { 3858 | + /* alloc a new tree_node */ 3859 | + tree_node = alloc_tree_node(&unstable_tree_node_list); 3860 | + if (!tree_node) 3861 | + return NULL; 3862 | + 3863 | + tree_node->hash = hash; 3864 | + rb_link_node(&tree_node->node, parent, new); 3865 | + rb_insert_color(&tree_node->node, &root_unstable_tree); 3866 | + parent = NULL; 3867 | + new = &tree_node->sub_root.rb_node; 3868 | + } 3869 | + 3870 | + /* did not found even in sub-tree */ 3871 | + rmap_item->tree_node = tree_node; 3872 | + rmap_item->address |= UNSTABLE_FLAG; 3873 | + rmap_item->hash_round = uksm_hash_round; 3874 | + rb_link_node(&rmap_item->node, parent, new); 3875 | + rb_insert_color(&rmap_item->node, &tree_node->sub_root); 3876 | + 3877 | + uksm_pages_unshared++; 3878 | + return NULL; 3879 | + 3880 | +get_page_out: 3881 | + if (tree_rmap_item->page == rmap_item->page) 3882 | + return NULL; 3883 | + 3884 | + if (get_tree_rmap_item_page(tree_rmap_item)) 3885 | + return NULL; 3886 | + 3887 | + return tree_rmap_item; 3888 | +} 3889 | + 3890 | +static void hold_anon_vma(struct rmap_item *rmap_item, 3891 | + struct anon_vma *anon_vma) 3892 | +{ 3893 | + rmap_item->anon_vma = anon_vma; 3894 | + get_anon_vma(anon_vma); 3895 | +} 3896 | + 3897 | + 3898 | +/** 3899 | + * stable_tree_append() - append a rmap_item to a stable node. Deduplication 3900 | + * ratio statistics is done in this function. 
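Both the stable tree (stable_tree_search/stable_tree_insert above) and the unstable tree (unstable_tree_search_insert) use the same two-level scheme: a first-level rb-tree keyed by the cheap partial-strength 32-bit hash, and a per-node sub-tree keyed by the full-strength hash_max, which rmap_item_hash_max() only computes lazily when a first-level collision forces it. The following standalone sketch is illustration only, not patch code; it uses simplified types, a toy hash, and plain sorted arrays in place of the rb-trees.

#include <stdint.h>

#define STRENGTH_PARTIAL  4   /* assumed: hash only part of the page */
#define STRENGTH_FULL    16   /* assumed: hash the whole page        */

/* Stand-in for page_hash()/page_hash_max(): a trivial hash over the
 * first 'strength' bytes. The real patch uses its own adaptive hash. */
static uint32_t toy_hash(const uint8_t *page, int strength)
{
	uint32_t h = 0;
	for (int i = 0; i < strength; i++)
		h = h * 31 + page[i];
	return h;
}

struct toy_stable_node {
	uint32_t hash_max;              /* second-level key, computed lazily */
	const uint8_t *kpage;
};

struct toy_tree_node {
	uint32_t hash;                  /* first-level key (partial strength) */
	int count;                      /* stable nodes under this node       */
	struct toy_stable_node sub[8];  /* stand-in for the sub rb-tree       */
};

/* Mirrors the lookup order of stable_tree_search(): match on the cheap
 * hash first; only if the first-level node holds more than one entry do
 * we pay for the full-strength hash and descend into the sub-tree. */
const uint8_t *toy_lookup(struct toy_tree_node *nodes, int n,
			  const uint8_t *page)
{
	uint32_t hash = toy_hash(page, STRENGTH_PARTIAL);

	for (int i = 0; i < n; i++) {
		if (nodes[i].hash != hash)
			continue;
		if (nodes[i].count == 1)
			return nodes[i].sub[0].kpage;

		uint32_t hash_max = toy_hash(page, STRENGTH_FULL);
		for (int j = 0; j < nodes[i].count; j++)
			if (nodes[i].sub[j].hash_max == hash_max)
				return nodes[i].sub[j].kpage;
		return NULL;    /* collision at both levels */
	}
	return NULL;
}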
3901 | + * 3902 | + */ 3903 | +static void stable_tree_append(struct rmap_item *rmap_item, 3904 | + struct stable_node *stable_node, int logdedup) 3905 | +{ 3906 | + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; 3907 | + unsigned long key = (unsigned long)rmap_item->slot; 3908 | + unsigned long factor = rmap_item->slot->rung->step; 3909 | + 3910 | + BUG_ON(!stable_node); 3911 | + rmap_item->address |= STABLE_FLAG; 3912 | + 3913 | + if (hlist_empty(&stable_node->hlist)) { 3914 | + uksm_pages_shared++; 3915 | + goto node_vma_new; 3916 | + } else { 3917 | + uksm_pages_sharing++; 3918 | + } 3919 | + 3920 | + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { 3921 | + if (node_vma->key >= key) 3922 | + break; 3923 | + 3924 | + if (logdedup) { 3925 | + node_vma->slot->pages_bemerged += factor; 3926 | + if (list_empty(&node_vma->slot->dedup_list)) 3927 | + list_add(&node_vma->slot->dedup_list, 3928 | + &vma_slot_dedup); 3929 | + } 3930 | + } 3931 | + 3932 | + if (node_vma) { 3933 | + if (node_vma->key == key) { 3934 | + node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); 3935 | + goto node_vma_ok; 3936 | + } else if (node_vma->key > key) { 3937 | + node_vma_cont = node_vma; 3938 | + } 3939 | + } 3940 | + 3941 | +node_vma_new: 3942 | + /* no same vma already in node, alloc a new node_vma */ 3943 | + new_node_vma = alloc_node_vma(); 3944 | + BUG_ON(!new_node_vma); 3945 | + new_node_vma->head = stable_node; 3946 | + new_node_vma->slot = rmap_item->slot; 3947 | + 3948 | + if (!node_vma) { 3949 | + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); 3950 | + } else if (node_vma->key != key) { 3951 | + if (node_vma->key < key) 3952 | + hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); 3953 | + else { 3954 | + hlist_add_before(&new_node_vma->hlist, 3955 | + &node_vma->hlist); 3956 | + } 3957 | + 3958 | + } 3959 | + node_vma = new_node_vma; 3960 | + 3961 | +node_vma_ok: /* ok, ready to add to the list */ 3962 | + rmap_item->head = node_vma; 3963 | + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); 3964 | + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); 3965 | + if (logdedup) { 3966 | + rmap_item->slot->pages_merged++; 3967 | + if (node_vma_cont) { 3968 | + node_vma = node_vma_cont; 3969 | + hlist_for_each_entry_continue(node_vma, hlist) { 3970 | + node_vma->slot->pages_bemerged += factor; 3971 | + if (list_empty(&node_vma->slot->dedup_list)) 3972 | + list_add(&node_vma->slot->dedup_list, 3973 | + &vma_slot_dedup); 3974 | + } 3975 | + } 3976 | + } 3977 | +} 3978 | + 3979 | +/* 3980 | + * We use break_ksm to break COW on a ksm page: it's a stripped down 3981 | + * 3982 | + * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) 3983 | + * put_page(page); 3984 | + * 3985 | + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, 3986 | + * in case the application has unmapped and remapped mm,addr meanwhile. 3987 | + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP 3988 | + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. 
3989 | + */ 3990 | +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) 3991 | +{ 3992 | + struct page *page; 3993 | + int ret = 0; 3994 | + 3995 | + do { 3996 | + cond_resched(); 3997 | + page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); 3998 | + if (IS_ERR_OR_NULL(page)) 3999 | + break; 4000 | + if (PageKsm(page)) { 4001 | + ret = handle_mm_fault(vma->vm_mm, vma, addr, 4002 | + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); 4003 | + } else 4004 | + ret = VM_FAULT_WRITE; 4005 | + put_page(page); 4006 | + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); 4007 | + /* 4008 | + * We must loop because handle_mm_fault() may back out if there's 4009 | + * any difficulty e.g. if pte accessed bit gets updated concurrently. 4010 | + * 4011 | + * VM_FAULT_WRITE is what we have been hoping for: it indicates that 4012 | + * COW has been broken, even if the vma does not permit VM_WRITE; 4013 | + * but note that a concurrent fault might break PageKsm for us. 4014 | + * 4015 | + * VM_FAULT_SIGBUS could occur if we race with truncation of the 4016 | + * backing file, which also invalidates anonymous pages: that's 4017 | + * okay, that truncation will have unmapped the PageKsm for us. 4018 | + * 4019 | + * VM_FAULT_OOM: at the time of writing (late July 2009), setting 4020 | + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the 4021 | + * current task has TIF_MEMDIE set, and will be OOM killed on return 4022 | + * to user; and ksmd, having no mm, would never be chosen for that. 4023 | + * 4024 | + * But if the mm is in a limited mem_cgroup, then the fault may fail 4025 | + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and 4026 | + * even ksmd can fail in this way - though it's usually breaking ksm 4027 | + * just to undo a merge it made a moment before, so unlikely to oom. 4028 | + * 4029 | + * That's a pity: we might therefore have more kernel pages allocated 4030 | + * than we're counting as nodes in the stable tree; but uksm_do_scan 4031 | + * will retry to break_cow on each pass, so should recover the page 4032 | + * in due course. The important thing is to not let VM_MERGEABLE 4033 | + * be cleared while any such pages might remain in the area. 4034 | + */ 4035 | + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 4036 | +} 4037 | + 4038 | +static void break_cow(struct rmap_item *rmap_item) 4039 | +{ 4040 | + struct vm_area_struct *vma = rmap_item->slot->vma; 4041 | + struct mm_struct *mm = vma->vm_mm; 4042 | + unsigned long addr = get_rmap_addr(rmap_item); 4043 | + 4044 | + if (uksm_test_exit(mm)) 4045 | + goto out; 4046 | + 4047 | + break_ksm(vma, addr); 4048 | +out: 4049 | + return; 4050 | +} 4051 | + 4052 | +/* 4053 | + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather 4054 | + * than check every pte of a given vma, the locking doesn't quite work for 4055 | + * that - an rmap_item is assigned to the stable tree after inserting ksm 4056 | + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing 4057 | + * rmap_items from parent to child at fork time (so as not to waste time 4058 | + * if exit comes before the next scan reaches it). 4059 | + * 4060 | + * Similarly, although we'd like to remove rmap_items (so updating counts 4061 | + * and freeing memory) when unmerging an area, it's easier to leave that 4062 | + * to the next pass of ksmd - consider, for example, how ksmd might be 4063 | + * in cmp_and_merge_page on one of the rmap_items we would be removing. 
4064 | + */ 4065 | +inline int unmerge_uksm_pages(struct vm_area_struct *vma, 4066 | + unsigned long start, unsigned long end) 4067 | +{ 4068 | + unsigned long addr; 4069 | + int err = 0; 4070 | + 4071 | + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { 4072 | + if (uksm_test_exit(vma->vm_mm)) 4073 | + break; 4074 | + if (signal_pending(current)) 4075 | + err = -ERESTARTSYS; 4076 | + else 4077 | + err = break_ksm(vma, addr); 4078 | + } 4079 | + return err; 4080 | +} 4081 | + 4082 | +static inline void inc_uksm_pages_scanned(void) 4083 | +{ 4084 | + u64 delta; 4085 | + 4086 | + 4087 | + if (uksm_pages_scanned == U64_MAX) { 4088 | + encode_benefit(); 4089 | + 4090 | + delta = uksm_pages_scanned >> pages_scanned_base; 4091 | + 4092 | + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { 4093 | + pages_scanned_stored >>= 1; 4094 | + delta >>= 1; 4095 | + pages_scanned_base++; 4096 | + } 4097 | + 4098 | + pages_scanned_stored += delta; 4099 | + 4100 | + uksm_pages_scanned = uksm_pages_scanned_last = 0; 4101 | + } 4102 | + 4103 | + uksm_pages_scanned++; 4104 | +} 4105 | + 4106 | +static inline int find_zero_page_hash(int strength, u32 hash) 4107 | +{ 4108 | + return (zero_hash_table[strength] == hash); 4109 | +} 4110 | + 4111 | +static 4112 | +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) 4113 | +{ 4114 | + struct page *zero_page = empty_uksm_zero_page; 4115 | + struct mm_struct *mm = vma->vm_mm; 4116 | + pte_t orig_pte = __pte(0); 4117 | + int err = -EFAULT; 4118 | + 4119 | + if (uksm_test_exit(mm)) 4120 | + goto out; 4121 | + 4122 | + if (!trylock_page(page)) 4123 | + goto out; 4124 | + 4125 | + if (!PageAnon(page)) 4126 | + goto out_unlock; 4127 | + 4128 | + if (PageTransCompound(page)) { 4129 | + err = split_huge_page(page); 4130 | + if (err) 4131 | + goto out_unlock; 4132 | + } 4133 | + 4134 | + if (write_protect_page(vma, page, &orig_pte, 0) == 0) { 4135 | + if (is_page_full_zero(page)) 4136 | + err = replace_page(vma, page, zero_page, orig_pte); 4137 | + } 4138 | + 4139 | +out_unlock: 4140 | + unlock_page(page); 4141 | +out: 4142 | + return err; 4143 | +} 4144 | + 4145 | +/* 4146 | + * cmp_and_merge_page() - first see if page can be merged into the stable 4147 | + * tree; if not, compare hash to previous and if it's the same, see if page 4148 | + * can be inserted into the unstable tree, or merged with a page already there 4149 | + * and both transferred to the stable tree. 4150 | + * 4151 | + * @page: the page that we are searching identical page to. 4152 | + * @rmap_item: the reverse mapping into the virtual address of this page 4153 | + */ 4154 | +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) 4155 | +{ 4156 | + struct rmap_item *tree_rmap_item; 4157 | + struct page *page; 4158 | + struct page *kpage = NULL; 4159 | + u32 hash_max; 4160 | + int err; 4161 | + unsigned int success1, success2; 4162 | + struct stable_node *snode; 4163 | + int cmp; 4164 | + struct rb_node *parent = NULL, **new; 4165 | + 4166 | + remove_rmap_item_from_tree(rmap_item); 4167 | + page = rmap_item->page; 4168 | + 4169 | + /* We first start with searching the page inside the stable tree */ 4170 | + kpage = stable_tree_search(rmap_item, hash); 4171 | + if (kpage) { 4172 | + err = try_to_merge_with_uksm_page(rmap_item, kpage, 4173 | + hash); 4174 | + if (!err) { 4175 | + /* 4176 | + * The page was successfully merged, add 4177 | + * its rmap_item to the stable tree. 
4178 | + * page lock is needed because it's 4179 | + * racing with try_to_unmap_ksm(), etc. 4180 | + */ 4181 | + lock_page(kpage); 4182 | + snode = page_stable_node(kpage); 4183 | + stable_tree_append(rmap_item, snode, 1); 4184 | + unlock_page(kpage); 4185 | + put_page(kpage); 4186 | + return; /* success */ 4187 | + } 4188 | + put_page(kpage); 4189 | + 4190 | + /* 4191 | + * if it's a collision and it has been search in sub-rbtree 4192 | + * (hash_max != 0), we want to abort, because if it is 4193 | + * successfully merged in unstable tree, the collision trends to 4194 | + * happen again. 4195 | + */ 4196 | + if (err == MERGE_ERR_COLLI && rmap_item->hash_max) 4197 | + return; 4198 | + } 4199 | + 4200 | + tree_rmap_item = 4201 | + unstable_tree_search_insert(rmap_item, hash); 4202 | + if (tree_rmap_item) { 4203 | + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); 4204 | + /* 4205 | + * As soon as we merge this page, we want to remove the 4206 | + * rmap_item of the page we have merged with from the unstable 4207 | + * tree, and insert it instead as new node in the stable tree. 4208 | + */ 4209 | + if (!err) { 4210 | + kpage = page; 4211 | + remove_rmap_item_from_tree(tree_rmap_item); 4212 | + lock_page(kpage); 4213 | + snode = stable_tree_insert(&kpage, hash, 4214 | + rmap_item, tree_rmap_item, 4215 | + &success1, &success2); 4216 | + 4217 | + /* 4218 | + * Do not log dedup for tree item, it's not counted as 4219 | + * scanned in this round. 4220 | + */ 4221 | + if (success2) 4222 | + stable_tree_append(tree_rmap_item, snode, 0); 4223 | + 4224 | + /* 4225 | + * The order of these two stable append is important: 4226 | + * we are scanning rmap_item. 4227 | + */ 4228 | + if (success1) 4229 | + stable_tree_append(rmap_item, snode, 1); 4230 | + 4231 | + /* 4232 | + * The original kpage may be unlocked inside 4233 | + * stable_tree_insert() already. This page 4234 | + * should be unlocked before doing 4235 | + * break_cow(). 4236 | + */ 4237 | + unlock_page(kpage); 4238 | + 4239 | + if (!success1) 4240 | + break_cow(rmap_item); 4241 | + 4242 | + if (!success2) 4243 | + break_cow(tree_rmap_item); 4244 | + 4245 | + } else if (err == MERGE_ERR_COLLI) { 4246 | + BUG_ON(tree_rmap_item->tree_node->count > 1); 4247 | + 4248 | + rmap_item_hash_max(tree_rmap_item, 4249 | + tree_rmap_item->tree_node->hash); 4250 | + 4251 | + hash_max = rmap_item_hash_max(rmap_item, hash); 4252 | + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); 4253 | + parent = &tree_rmap_item->node; 4254 | + if (cmp < 0) 4255 | + new = &parent->rb_left; 4256 | + else if (cmp > 0) 4257 | + new = &parent->rb_right; 4258 | + else 4259 | + goto put_up_out; 4260 | + 4261 | + rmap_item->tree_node = tree_rmap_item->tree_node; 4262 | + rmap_item->address |= UNSTABLE_FLAG; 4263 | + rmap_item->hash_round = uksm_hash_round; 4264 | + rb_link_node(&rmap_item->node, parent, new); 4265 | + rb_insert_color(&rmap_item->node, 4266 | + &tree_rmap_item->tree_node->sub_root); 4267 | + rmap_item->tree_node->count++; 4268 | + } else { 4269 | + /* 4270 | + * either one of the page has changed or they collide 4271 | + * at the max hash, we consider them as ill items. 
4272 | + */ 4273 | + remove_rmap_item_from_tree(tree_rmap_item); 4274 | + } 4275 | +put_up_out: 4276 | + put_page(tree_rmap_item->page); 4277 | + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); 4278 | + } 4279 | +} 4280 | + 4281 | + 4282 | + 4283 | + 4284 | +static inline unsigned long get_pool_index(struct vma_slot *slot, 4285 | + unsigned long index) 4286 | +{ 4287 | + unsigned long pool_index; 4288 | + 4289 | + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; 4290 | + if (pool_index >= slot->pool_size) 4291 | + BUG(); 4292 | + return pool_index; 4293 | +} 4294 | + 4295 | +static inline unsigned long index_page_offset(unsigned long index) 4296 | +{ 4297 | + return offset_in_page(sizeof(struct rmap_list_entry *) * index); 4298 | +} 4299 | + 4300 | +static inline 4301 | +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, 4302 | + unsigned long index, int need_alloc) 4303 | +{ 4304 | + unsigned long pool_index; 4305 | + struct page *page; 4306 | + void *addr; 4307 | + 4308 | + 4309 | + pool_index = get_pool_index(slot, index); 4310 | + if (!slot->rmap_list_pool[pool_index]) { 4311 | + if (!need_alloc) 4312 | + return NULL; 4313 | + 4314 | + page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); 4315 | + if (!page) 4316 | + return NULL; 4317 | + 4318 | + slot->rmap_list_pool[pool_index] = page; 4319 | + } 4320 | + 4321 | + addr = kmap(slot->rmap_list_pool[pool_index]); 4322 | + addr += index_page_offset(index); 4323 | + 4324 | + return addr; 4325 | +} 4326 | + 4327 | +static inline void put_rmap_list_entry(struct vma_slot *slot, 4328 | + unsigned long index) 4329 | +{ 4330 | + unsigned long pool_index; 4331 | + 4332 | + pool_index = get_pool_index(slot, index); 4333 | + BUG_ON(!slot->rmap_list_pool[pool_index]); 4334 | + kunmap(slot->rmap_list_pool[pool_index]); 4335 | +} 4336 | + 4337 | +static inline int entry_is_new(struct rmap_list_entry *entry) 4338 | +{ 4339 | + return !entry->item; 4340 | +} 4341 | + 4342 | +static inline unsigned long get_index_orig_addr(struct vma_slot *slot, 4343 | + unsigned long index) 4344 | +{ 4345 | + return slot->vma->vm_start + (index << PAGE_SHIFT); 4346 | +} 4347 | + 4348 | +static inline unsigned long get_entry_address(struct rmap_list_entry *entry) 4349 | +{ 4350 | + unsigned long addr; 4351 | + 4352 | + if (is_addr(entry->addr)) 4353 | + addr = get_clean_addr(entry->addr); 4354 | + else if (entry->item) 4355 | + addr = get_rmap_addr(entry->item); 4356 | + else 4357 | + BUG(); 4358 | + 4359 | + return addr; 4360 | +} 4361 | + 4362 | +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) 4363 | +{ 4364 | + if (is_addr(entry->addr)) 4365 | + return NULL; 4366 | + 4367 | + return entry->item; 4368 | +} 4369 | + 4370 | +static inline void inc_rmap_list_pool_count(struct vma_slot *slot, 4371 | + unsigned long index) 4372 | +{ 4373 | + unsigned long pool_index; 4374 | + 4375 | + pool_index = get_pool_index(slot, index); 4376 | + BUG_ON(!slot->rmap_list_pool[pool_index]); 4377 | + slot->pool_counts[pool_index]++; 4378 | +} 4379 | + 4380 | +static inline void dec_rmap_list_pool_count(struct vma_slot *slot, 4381 | + unsigned long index) 4382 | +{ 4383 | + unsigned long pool_index; 4384 | + 4385 | + pool_index = get_pool_index(slot, index); 4386 | + BUG_ON(!slot->rmap_list_pool[pool_index]); 4387 | + BUG_ON(!slot->pool_counts[pool_index]); 4388 | + slot->pool_counts[pool_index]--; 4389 | +} 4390 | + 4391 | +static inline int entry_has_rmap(struct rmap_list_entry *entry) 4392 | +{ 4393 | + 
return !is_addr(entry->addr) && entry->item; 4394 | +} 4395 | + 4396 | +static inline void swap_entries(struct rmap_list_entry *entry1, 4397 | + unsigned long index1, 4398 | + struct rmap_list_entry *entry2, 4399 | + unsigned long index2) 4400 | +{ 4401 | + struct rmap_list_entry tmp; 4402 | + 4403 | + /* swapping two new entries is meaningless */ 4404 | + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); 4405 | + 4406 | + tmp = *entry1; 4407 | + *entry1 = *entry2; 4408 | + *entry2 = tmp; 4409 | + 4410 | + if (entry_has_rmap(entry1)) 4411 | + entry1->item->entry_index = index1; 4412 | + 4413 | + if (entry_has_rmap(entry2)) 4414 | + entry2->item->entry_index = index2; 4415 | + 4416 | + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { 4417 | + inc_rmap_list_pool_count(entry1->item->slot, index1); 4418 | + dec_rmap_list_pool_count(entry1->item->slot, index2); 4419 | + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { 4420 | + inc_rmap_list_pool_count(entry2->item->slot, index2); 4421 | + dec_rmap_list_pool_count(entry2->item->slot, index1); 4422 | + } 4423 | +} 4424 | + 4425 | +static inline void free_entry_item(struct rmap_list_entry *entry) 4426 | +{ 4427 | + unsigned long index; 4428 | + struct rmap_item *item; 4429 | + 4430 | + if (!is_addr(entry->addr)) { 4431 | + BUG_ON(!entry->item); 4432 | + item = entry->item; 4433 | + entry->addr = get_rmap_addr(item); 4434 | + set_is_addr(entry->addr); 4435 | + index = item->entry_index; 4436 | + remove_rmap_item_from_tree(item); 4437 | + dec_rmap_list_pool_count(item->slot, index); 4438 | + free_rmap_item(item); 4439 | + } 4440 | +} 4441 | + 4442 | +static inline int pool_entry_boundary(unsigned long index) 4443 | +{ 4444 | + unsigned long linear_addr; 4445 | + 4446 | + linear_addr = sizeof(struct rmap_list_entry *) * index; 4447 | + return index && !offset_in_page(linear_addr); 4448 | +} 4449 | + 4450 | +static inline void try_free_last_pool(struct vma_slot *slot, 4451 | + unsigned long index) 4452 | +{ 4453 | + unsigned long pool_index; 4454 | + 4455 | + pool_index = get_pool_index(slot, index); 4456 | + if (slot->rmap_list_pool[pool_index] && 4457 | + !slot->pool_counts[pool_index]) { 4458 | + __free_page(slot->rmap_list_pool[pool_index]); 4459 | + slot->rmap_list_pool[pool_index] = NULL; 4460 | + slot->flags |= UKSM_SLOT_NEED_SORT; 4461 | + } 4462 | + 4463 | +} 4464 | + 4465 | +static inline unsigned long vma_item_index(struct vm_area_struct *vma, 4466 | + struct rmap_item *item) 4467 | +{ 4468 | + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; 4469 | +} 4470 | + 4471 | +static int within_same_pool(struct vma_slot *slot, 4472 | + unsigned long i, unsigned long j) 4473 | +{ 4474 | + unsigned long pool_i, pool_j; 4475 | + 4476 | + pool_i = get_pool_index(slot, i); 4477 | + pool_j = get_pool_index(slot, j); 4478 | + 4479 | + return (pool_i == pool_j); 4480 | +} 4481 | + 4482 | +static void sort_rmap_entry_list(struct vma_slot *slot) 4483 | +{ 4484 | + unsigned long i, j; 4485 | + struct rmap_list_entry *entry, *swap_entry; 4486 | + 4487 | + entry = get_rmap_list_entry(slot, 0, 0); 4488 | + for (i = 0; i < slot->pages; ) { 4489 | + 4490 | + if (!entry) 4491 | + goto skip_whole_pool; 4492 | + 4493 | + if (entry_is_new(entry)) 4494 | + goto next_entry; 4495 | + 4496 | + if (is_addr(entry->addr)) { 4497 | + entry->addr = 0; 4498 | + goto next_entry; 4499 | + } 4500 | + 4501 | + j = vma_item_index(slot->vma, entry->item); 4502 | + if (j == i) 4503 | + goto next_entry; 4504 | + 4505 | + if (within_same_pool(slot, 
i, j)) 4506 | + swap_entry = entry + j - i; 4507 | + else 4508 | + swap_entry = get_rmap_list_entry(slot, j, 1); 4509 | + 4510 | + swap_entries(entry, i, swap_entry, j); 4511 | + if (!within_same_pool(slot, i, j)) 4512 | + put_rmap_list_entry(slot, j); 4513 | + continue; 4514 | + 4515 | +skip_whole_pool: 4516 | + i += PAGE_SIZE / sizeof(*entry); 4517 | + if (i < slot->pages) 4518 | + entry = get_rmap_list_entry(slot, i, 0); 4519 | + continue; 4520 | + 4521 | +next_entry: 4522 | + if (i >= slot->pages - 1 || 4523 | + !within_same_pool(slot, i, i + 1)) { 4524 | + put_rmap_list_entry(slot, i); 4525 | + if (i + 1 < slot->pages) 4526 | + entry = get_rmap_list_entry(slot, i + 1, 0); 4527 | + } else 4528 | + entry++; 4529 | + i++; 4530 | + continue; 4531 | + } 4532 | + 4533 | + /* free empty pool entries which contain no rmap_item */ 4534 | + /* CAN be simplied to based on only pool_counts when bug freed !!!!! */ 4535 | + for (i = 0; i < slot->pool_size; i++) { 4536 | + unsigned char has_rmap; 4537 | + void *addr; 4538 | + 4539 | + if (!slot->rmap_list_pool[i]) 4540 | + continue; 4541 | + 4542 | + has_rmap = 0; 4543 | + addr = kmap(slot->rmap_list_pool[i]); 4544 | + BUG_ON(!addr); 4545 | + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { 4546 | + entry = (struct rmap_list_entry *)addr + j; 4547 | + if (is_addr(entry->addr)) 4548 | + continue; 4549 | + if (!entry->item) 4550 | + continue; 4551 | + has_rmap = 1; 4552 | + } 4553 | + kunmap(slot->rmap_list_pool[i]); 4554 | + if (!has_rmap) { 4555 | + BUG_ON(slot->pool_counts[i]); 4556 | + __free_page(slot->rmap_list_pool[i]); 4557 | + slot->rmap_list_pool[i] = NULL; 4558 | + } 4559 | + } 4560 | + 4561 | + slot->flags &= ~UKSM_SLOT_NEED_SORT; 4562 | +} 4563 | + 4564 | +/* 4565 | + * vma_fully_scanned() - if all the pages in this slot have been scanned. 4566 | + */ 4567 | +static inline int vma_fully_scanned(struct vma_slot *slot) 4568 | +{ 4569 | + return slot->pages_scanned == slot->pages; 4570 | +} 4571 | + 4572 | +/** 4573 | + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to 4574 | + * its random permutation. This function is embedded with the random 4575 | + * permutation index management code. 
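get_next_rmap_item() below walks a slot's pages as an incremental random permutation: on each call it takes the next fixed scan index, swaps a uniformly chosen later entry into that position, and scans it, so a full round visits every page exactly once in a fresh random order. The sketch below is illustration only, not patch code; it shows the same partial Fisher-Yates step on a plain index array, with rand() standing in for prandom_u32().

#include <stdlib.h>

unsigned long next_scan_index(unsigned long *perm, unsigned long pages,
			      unsigned long pages_scanned)
{
	unsigned long scan = pages_scanned % pages;
	unsigned long range = pages - scan;            /* rand_range in the patch */
	unsigned long swap = scan + (rand() % range);  /* prandom_u32() stand-in  */

	unsigned long tmp = perm[scan];                /* swap_entries()          */
	perm[scan] = perm[swap];
	perm[swap] = tmp;

	return perm[scan];            /* the index to scan on this call */
}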
4576 | + */ 4577 | +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) 4578 | +{ 4579 | + unsigned long rand_range, addr, swap_index, scan_index; 4580 | + struct rmap_item *item = NULL; 4581 | + struct rmap_list_entry *scan_entry, *swap_entry = NULL; 4582 | + struct page *page; 4583 | + 4584 | + scan_index = swap_index = slot->pages_scanned % slot->pages; 4585 | + 4586 | + if (pool_entry_boundary(scan_index)) 4587 | + try_free_last_pool(slot, scan_index - 1); 4588 | + 4589 | + if (vma_fully_scanned(slot)) { 4590 | + if (slot->flags & UKSM_SLOT_NEED_SORT) 4591 | + slot->flags |= UKSM_SLOT_NEED_RERAND; 4592 | + else 4593 | + slot->flags &= ~UKSM_SLOT_NEED_RERAND; 4594 | + if (slot->flags & UKSM_SLOT_NEED_SORT) 4595 | + sort_rmap_entry_list(slot); 4596 | + } 4597 | + 4598 | + scan_entry = get_rmap_list_entry(slot, scan_index, 1); 4599 | + if (!scan_entry) 4600 | + return NULL; 4601 | + 4602 | + if (entry_is_new(scan_entry)) { 4603 | + scan_entry->addr = get_index_orig_addr(slot, scan_index); 4604 | + set_is_addr(scan_entry->addr); 4605 | + } 4606 | + 4607 | + if (slot->flags & UKSM_SLOT_NEED_RERAND) { 4608 | + rand_range = slot->pages - scan_index; 4609 | + BUG_ON(!rand_range); 4610 | + swap_index = scan_index + (prandom_u32() % rand_range); 4611 | + } 4612 | + 4613 | + if (swap_index != scan_index) { 4614 | + swap_entry = get_rmap_list_entry(slot, swap_index, 1); 4615 | + if (entry_is_new(swap_entry)) { 4616 | + swap_entry->addr = get_index_orig_addr(slot, 4617 | + swap_index); 4618 | + set_is_addr(swap_entry->addr); 4619 | + } 4620 | + swap_entries(scan_entry, scan_index, swap_entry, swap_index); 4621 | + } 4622 | + 4623 | + addr = get_entry_address(scan_entry); 4624 | + item = get_entry_item(scan_entry); 4625 | + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); 4626 | + 4627 | + page = follow_page(slot->vma, addr, FOLL_GET); 4628 | + if (IS_ERR_OR_NULL(page)) 4629 | + goto nopage; 4630 | + 4631 | + if (!PageAnon(page)) 4632 | + goto putpage; 4633 | + 4634 | + /*check is zero_page pfn or uksm_zero_page*/ 4635 | + if ((page_to_pfn(page) == zero_pfn) 4636 | + || (page_to_pfn(page) == uksm_zero_pfn)) 4637 | + goto putpage; 4638 | + 4639 | + flush_anon_page(slot->vma, page, addr); 4640 | + flush_dcache_page(page); 4641 | + 4642 | + 4643 | + *hash = page_hash(page, hash_strength, 1); 4644 | + inc_uksm_pages_scanned(); 4645 | + /*if the page content all zero, re-map to zero-page*/ 4646 | + if (find_zero_page_hash(hash_strength, *hash)) { 4647 | + if (!cmp_and_merge_zero_page(slot->vma, page)) { 4648 | + slot->pages_merged++; 4649 | + inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); 4650 | + 4651 | + /* For full-zero pages, no need to create rmap item */ 4652 | + goto putpage; 4653 | + } else { 4654 | + inc_rshash_neg(memcmp_cost / 2); 4655 | + } 4656 | + } 4657 | + 4658 | + if (!item) { 4659 | + item = alloc_rmap_item(); 4660 | + if (item) { 4661 | + /* It has already been zeroed */ 4662 | + item->slot = slot; 4663 | + item->address = addr; 4664 | + item->entry_index = scan_index; 4665 | + scan_entry->item = item; 4666 | + inc_rmap_list_pool_count(slot, scan_index); 4667 | + } else 4668 | + goto putpage; 4669 | + } 4670 | + 4671 | + BUG_ON(item->slot != slot); 4672 | + /* the page may have changed */ 4673 | + item->page = page; 4674 | + put_rmap_list_entry(slot, scan_index); 4675 | + if (swap_entry) 4676 | + put_rmap_list_entry(slot, swap_index); 4677 | + return item; 4678 | + 4679 | +putpage: 4680 | + put_page(page); 4681 | + page = NULL; 4682 | +nopage: 
4683 | + /* no page, store addr back and free rmap_item if possible */ 4684 | + free_entry_item(scan_entry); 4685 | + put_rmap_list_entry(slot, scan_index); 4686 | + if (swap_entry) 4687 | + put_rmap_list_entry(slot, swap_index); 4688 | + return NULL; 4689 | +} 4690 | + 4691 | +static inline int in_stable_tree(struct rmap_item *rmap_item) 4692 | +{ 4693 | + return rmap_item->address & STABLE_FLAG; 4694 | +} 4695 | + 4696 | +/** 4697 | + * scan_vma_one_page() - scan the next page in a vma_slot. Called with 4698 | + * mmap_sem locked. 4699 | + */ 4700 | +static noinline void scan_vma_one_page(struct vma_slot *slot) 4701 | +{ 4702 | + u32 hash; 4703 | + struct mm_struct *mm; 4704 | + struct rmap_item *rmap_item = NULL; 4705 | + struct vm_area_struct *vma = slot->vma; 4706 | + 4707 | + mm = vma->vm_mm; 4708 | + BUG_ON(!mm); 4709 | + BUG_ON(!slot); 4710 | + 4711 | + rmap_item = get_next_rmap_item(slot, &hash); 4712 | + if (!rmap_item) 4713 | + goto out1; 4714 | + 4715 | + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) 4716 | + goto out2; 4717 | + 4718 | + cmp_and_merge_page(rmap_item, hash); 4719 | +out2: 4720 | + put_page(rmap_item->page); 4721 | +out1: 4722 | + slot->pages_scanned++; 4723 | + slot->this_sampled++; 4724 | + if (slot->fully_scanned_round != fully_scanned_round) 4725 | + scanned_virtual_pages++; 4726 | + 4727 | + if (vma_fully_scanned(slot)) 4728 | + slot->fully_scanned_round = fully_scanned_round; 4729 | +} 4730 | + 4731 | +static inline unsigned long rung_get_pages(struct scan_rung *rung) 4732 | +{ 4733 | + struct slot_tree_node *node; 4734 | + 4735 | + if (!rung->vma_root.rnode) 4736 | + return 0; 4737 | + 4738 | + node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); 4739 | + 4740 | + return node->size; 4741 | +} 4742 | + 4743 | +#define RUNG_SAMPLED_MIN 3 4744 | + 4745 | +static inline 4746 | +void uksm_calc_rung_step(struct scan_rung *rung, 4747 | + unsigned long page_time, unsigned long ratio) 4748 | +{ 4749 | + unsigned long sampled, pages; 4750 | + 4751 | + /* will be fully scanned ? */ 4752 | + if (!rung->cover_msecs) { 4753 | + rung->step = 1; 4754 | + return; 4755 | + } 4756 | + 4757 | + sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) 4758 | + * ratio / page_time; 4759 | + 4760 | + /* 4761 | + * Before we finsish a scan round and expensive per-round jobs, 4762 | + * we need to have a chance to estimate the per page time. So 4763 | + * the sampled number can not be too small. 
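The arithmetic uksm_calc_rung_step() performs can be read in isolation: the rung's CPU budget (cover_msecs and cpu_ratio) and the measured per-page time give the number of pages that can be sampled this round, and the step is the slot-page count divided by that sample size, floored so the per-page time estimate stays meaningful. A minimal sketch follows; it is illustration only, and TIME_RATIO_SCALE here is an assumed example value rather than the patch's definition.

#define NSEC_PER_MSEC    1000000UL
#define TIME_RATIO_SCALE 100UL      /* assumed example scale for 'ratio' */
#define RUNG_SAMPLED_MIN 3UL        /* matches the patch                 */

unsigned long calc_step(unsigned long pages, unsigned long cover_msecs,
			unsigned long ratio, unsigned long page_time_ns)
{
	if (!cover_msecs)               /* rung wants a full scan */
		return 1;

	/* pages we can afford to sample within cover_msecs */
	unsigned long sampled = cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
				* ratio / page_time_ns;
	if (sampled < RUNG_SAMPLED_MIN)
		sampled = RUNG_SAMPLED_MIN; /* keep the per-page estimate usable */

	return pages > sampled ? pages / sampled : 1;
}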
4764 | + */ 4765 | + if (sampled < RUNG_SAMPLED_MIN) 4766 | + sampled = RUNG_SAMPLED_MIN; 4767 | + 4768 | + pages = rung_get_pages(rung); 4769 | + if (likely(pages > sampled)) 4770 | + rung->step = pages / sampled; 4771 | + else 4772 | + rung->step = 1; 4773 | +} 4774 | + 4775 | +static inline int step_need_recalc(struct scan_rung *rung) 4776 | +{ 4777 | + unsigned long pages, stepmax; 4778 | + 4779 | + pages = rung_get_pages(rung); 4780 | + stepmax = pages / RUNG_SAMPLED_MIN; 4781 | + 4782 | + return pages && (rung->step > pages || 4783 | + (stepmax && rung->step > stepmax)); 4784 | +} 4785 | + 4786 | +static inline 4787 | +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) 4788 | +{ 4789 | + struct vma_slot *slot; 4790 | + 4791 | + if (finished) 4792 | + rung->flags |= UKSM_RUNG_ROUND_FINISHED; 4793 | + 4794 | + if (step_recalc || step_need_recalc(rung)) { 4795 | + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); 4796 | + BUG_ON(step_need_recalc(rung)); 4797 | + } 4798 | + 4799 | + slot_iter_index = prandom_u32() % rung->step; 4800 | + BUG_ON(!rung->vma_root.rnode); 4801 | + slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); 4802 | + BUG_ON(!slot); 4803 | + 4804 | + rung->current_scan = slot; 4805 | + rung->current_offset = slot_iter_index; 4806 | +} 4807 | + 4808 | +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) 4809 | +{ 4810 | + return &slot->rung->vma_root; 4811 | +} 4812 | + 4813 | +/* 4814 | + * return if resetted. 4815 | + */ 4816 | +static int advance_current_scan(struct scan_rung *rung) 4817 | +{ 4818 | + unsigned short n; 4819 | + struct vma_slot *slot, *next = NULL; 4820 | + 4821 | + BUG_ON(!rung->vma_root.num); 4822 | + 4823 | + slot = rung->current_scan; 4824 | + n = (slot->pages - rung->current_offset) % rung->step; 4825 | + slot_iter_index = rung->step - n; 4826 | + next = sradix_tree_next(&rung->vma_root, slot->snode, 4827 | + slot->sindex, slot_iter); 4828 | + 4829 | + if (next) { 4830 | + rung->current_offset = slot_iter_index; 4831 | + rung->current_scan = next; 4832 | + return 0; 4833 | + } else { 4834 | + reset_current_scan(rung, 1, 0); 4835 | + return 1; 4836 | + } 4837 | +} 4838 | + 4839 | +static inline void rung_rm_slot(struct vma_slot *slot) 4840 | +{ 4841 | + struct scan_rung *rung = slot->rung; 4842 | + struct sradix_tree_root *root; 4843 | + 4844 | + if (rung->current_scan == slot) 4845 | + advance_current_scan(rung); 4846 | + 4847 | + root = slot_get_root(slot); 4848 | + sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); 4849 | + slot->snode = NULL; 4850 | + if (step_need_recalc(rung)) { 4851 | + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); 4852 | + BUG_ON(step_need_recalc(rung)); 4853 | + } 4854 | + 4855 | + /* In case advance_current_scan loop back to this slot again */ 4856 | + if (rung->vma_root.num && rung->current_scan == slot) 4857 | + reset_current_scan(slot->rung, 1, 0); 4858 | +} 4859 | + 4860 | +static inline void rung_add_new_slots(struct scan_rung *rung, 4861 | + struct vma_slot **slots, unsigned long num) 4862 | +{ 4863 | + int err; 4864 | + struct vma_slot *slot; 4865 | + unsigned long i; 4866 | + struct sradix_tree_root *root = &rung->vma_root; 4867 | + 4868 | + err = sradix_tree_enter(root, (void **)slots, num); 4869 | + BUG_ON(err); 4870 | + 4871 | + for (i = 0; i < num; i++) { 4872 | + slot = slots[i]; 4873 | + slot->rung = rung; 4874 | + BUG_ON(vma_fully_scanned(slot)); 4875 | + } 4876 | + 4877 | + if (rung->vma_root.num == 
num) 4878 | + reset_current_scan(rung, 0, 1); 4879 | +} 4880 | + 4881 | +static inline int rung_add_one_slot(struct scan_rung *rung, 4882 | + struct vma_slot *slot) 4883 | +{ 4884 | + int err; 4885 | + 4886 | + err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); 4887 | + if (err) 4888 | + return err; 4889 | + 4890 | + slot->rung = rung; 4891 | + if (rung->vma_root.num == 1) 4892 | + reset_current_scan(rung, 0, 1); 4893 | + 4894 | + return 0; 4895 | +} 4896 | + 4897 | +/* 4898 | + * Return true if the slot is deleted from its rung. 4899 | + */ 4900 | +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) 4901 | +{ 4902 | + struct scan_rung *old_rung = slot->rung; 4903 | + int err; 4904 | + 4905 | + if (old_rung == rung) 4906 | + return 0; 4907 | + 4908 | + rung_rm_slot(slot); 4909 | + err = rung_add_one_slot(rung, slot); 4910 | + if (err) { 4911 | + err = rung_add_one_slot(old_rung, slot); 4912 | + WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ 4913 | + } 4914 | + 4915 | + return 1; 4916 | +} 4917 | + 4918 | +static inline int vma_rung_up(struct vma_slot *slot) 4919 | +{ 4920 | + struct scan_rung *rung; 4921 | + 4922 | + rung = slot->rung; 4923 | + if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) 4924 | + rung++; 4925 | + 4926 | + return vma_rung_enter(slot, rung); 4927 | +} 4928 | + 4929 | +static inline int vma_rung_down(struct vma_slot *slot) 4930 | +{ 4931 | + struct scan_rung *rung; 4932 | + 4933 | + rung = slot->rung; 4934 | + if (slot->rung != &uksm_scan_ladder[0]) 4935 | + rung--; 4936 | + 4937 | + return vma_rung_enter(slot, rung); 4938 | +} 4939 | + 4940 | +/** 4941 | + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. 4942 | + */ 4943 | +static unsigned long cal_dedup_ratio(struct vma_slot *slot) 4944 | +{ 4945 | + unsigned long ret; 4946 | + unsigned long pages; 4947 | + 4948 | + pages = slot->this_sampled; 4949 | + if (!pages) 4950 | + return 0; 4951 | + 4952 | + BUG_ON(slot->pages_scanned == slot->last_scanned); 4953 | + 4954 | + ret = slot->pages_merged; 4955 | + 4956 | + /* Thrashing area filtering */ 4957 | + if (ret && uksm_thrash_threshold) { 4958 | + if (slot->pages_cowed * 100 / slot->pages_merged 4959 | + > uksm_thrash_threshold) { 4960 | + ret = 0; 4961 | + } else { 4962 | + ret = slot->pages_merged - slot->pages_cowed; 4963 | + } 4964 | + } 4965 | + 4966 | + return ret * 100 / pages; 4967 | +} 4968 | + 4969 | +/** 4970 | + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. 4971 | + */ 4972 | +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) 4973 | +{ 4974 | + unsigned long ret; 4975 | + unsigned long pages; 4976 | + 4977 | + pages = slot->pages; 4978 | + if (!pages) 4979 | + return 0; 4980 | + 4981 | + ret = slot->pages_bemerged; 4982 | + 4983 | + /* Thrashing area filtering */ 4984 | + if (ret && uksm_thrash_threshold) { 4985 | + if (slot->pages_cowed * 100 / slot->pages_bemerged 4986 | + > uksm_thrash_threshold) { 4987 | + ret = 0; 4988 | + } else { 4989 | + ret = slot->pages_bemerged - slot->pages_cowed; 4990 | + } 4991 | + } 4992 | + 4993 | + return ret * 100 / pages; 4994 | +} 4995 | + 4996 | +/** 4997 | + * stable_node_reinsert() - When the hash_strength has been adjusted, the 4998 | + * stable tree need to be restructured, this is the function re-inserting the 4999 | + * stable node. 
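The thrashing filter used by cal_dedup_ratio() and cal_dedup_ratio_old() above is plain arithmetic: if COW break-outs are too large a fraction of the merges, the area is treated as thrashing and contributes nothing; otherwise only the merges that survived COW are counted, as a percentage of the pages considered. The sketch below is illustration only, not patch code.

unsigned long dedup_ratio(unsigned long sampled, unsigned long merged,
			  unsigned long cowed, unsigned long thrash_threshold)
{
	unsigned long ret = merged;

	if (!sampled)
		return 0;

	if (ret && thrash_threshold) {
		if (cowed * 100 / merged > thrash_threshold)
			ret = 0;                /* area thrashes, ignore it    */
		else
			ret = merged - cowed;   /* count only stable merges    */
	}

	return ret * 100 / sampled;             /* percentage of sampled pages */
}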
5000 | + */ 5001 | +static inline void stable_node_reinsert(struct stable_node *new_node, 5002 | + struct page *page, 5003 | + struct rb_root *root_treep, 5004 | + struct list_head *tree_node_listp, 5005 | + u32 hash) 5006 | +{ 5007 | + struct rb_node **new = &root_treep->rb_node; 5008 | + struct rb_node *parent = NULL; 5009 | + struct stable_node *stable_node; 5010 | + struct tree_node *tree_node; 5011 | + struct page *tree_page; 5012 | + int cmp; 5013 | + 5014 | + while (*new) { 5015 | + int cmp; 5016 | + 5017 | + tree_node = rb_entry(*new, struct tree_node, node); 5018 | + 5019 | + cmp = hash_cmp(hash, tree_node->hash); 5020 | + 5021 | + if (cmp < 0) { 5022 | + parent = *new; 5023 | + new = &parent->rb_left; 5024 | + } else if (cmp > 0) { 5025 | + parent = *new; 5026 | + new = &parent->rb_right; 5027 | + } else 5028 | + break; 5029 | + } 5030 | + 5031 | + if (*new) { 5032 | + /* find a stable tree node with same first level hash value */ 5033 | + stable_node_hash_max(new_node, page, hash); 5034 | + if (tree_node->count == 1) { 5035 | + stable_node = rb_entry(tree_node->sub_root.rb_node, 5036 | + struct stable_node, node); 5037 | + tree_page = get_uksm_page(stable_node, 1, 0); 5038 | + if (tree_page) { 5039 | + stable_node_hash_max(stable_node, 5040 | + tree_page, hash); 5041 | + put_page(tree_page); 5042 | + 5043 | + /* prepare for stable node insertion */ 5044 | + 5045 | + cmp = hash_cmp(new_node->hash_max, 5046 | + stable_node->hash_max); 5047 | + parent = &stable_node->node; 5048 | + if (cmp < 0) 5049 | + new = &parent->rb_left; 5050 | + else if (cmp > 0) 5051 | + new = &parent->rb_right; 5052 | + else 5053 | + goto failed; 5054 | + 5055 | + goto add_node; 5056 | + } else { 5057 | + /* the only stable_node deleted, the tree node 5058 | + * was not deleted. 5059 | + */ 5060 | + goto tree_node_reuse; 5061 | + } 5062 | + } 5063 | + 5064 | + /* well, search the collision subtree */ 5065 | + new = &tree_node->sub_root.rb_node; 5066 | + parent = NULL; 5067 | + BUG_ON(!*new); 5068 | + while (*new) { 5069 | + int cmp; 5070 | + 5071 | + stable_node = rb_entry(*new, struct stable_node, node); 5072 | + 5073 | + cmp = hash_cmp(new_node->hash_max, 5074 | + stable_node->hash_max); 5075 | + 5076 | + if (cmp < 0) { 5077 | + parent = *new; 5078 | + new = &parent->rb_left; 5079 | + } else if (cmp > 0) { 5080 | + parent = *new; 5081 | + new = &parent->rb_right; 5082 | + } else { 5083 | + /* oh, no, still a collision */ 5084 | + goto failed; 5085 | + } 5086 | + } 5087 | + 5088 | + goto add_node; 5089 | + } 5090 | + 5091 | + /* no tree node found */ 5092 | + tree_node = alloc_tree_node(tree_node_listp); 5093 | + if (!tree_node) { 5094 | + printk(KERN_ERR "UKSM: memory allocation error!\n"); 5095 | + goto failed; 5096 | + } else { 5097 | + tree_node->hash = hash; 5098 | + rb_link_node(&tree_node->node, parent, new); 5099 | + rb_insert_color(&tree_node->node, root_treep); 5100 | + 5101 | +tree_node_reuse: 5102 | + /* prepare for stable node insertion */ 5103 | + parent = NULL; 5104 | + new = &tree_node->sub_root.rb_node; 5105 | + } 5106 | + 5107 | +add_node: 5108 | + rb_link_node(&new_node->node, parent, new); 5109 | + rb_insert_color(&new_node->node, &tree_node->sub_root); 5110 | + new_node->tree_node = tree_node; 5111 | + tree_node->count++; 5112 | + return; 5113 | + 5114 | +failed: 5115 | + /* This can only happen when two nodes have collided 5116 | + * in two levels. 
5117 | + */ 5118 | + new_node->tree_node = NULL; 5119 | + return; 5120 | +} 5121 | + 5122 | +static inline void free_all_tree_nodes(struct list_head *list) 5123 | +{ 5124 | + struct tree_node *node, *tmp; 5125 | + 5126 | + list_for_each_entry_safe(node, tmp, list, all_list) { 5127 | + free_tree_node(node); 5128 | + } 5129 | +} 5130 | + 5131 | +/** 5132 | + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash 5133 | + * strength to the current hash_strength. It re-structures the hole tree. 5134 | + */ 5135 | +static inline void stable_tree_delta_hash(u32 prev_hash_strength) 5136 | +{ 5137 | + struct stable_node *node, *tmp; 5138 | + struct rb_root *root_new_treep; 5139 | + struct list_head *new_tree_node_listp; 5140 | + 5141 | + stable_tree_index = (stable_tree_index + 1) % 2; 5142 | + root_new_treep = &root_stable_tree[stable_tree_index]; 5143 | + new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; 5144 | + *root_new_treep = RB_ROOT; 5145 | + BUG_ON(!list_empty(new_tree_node_listp)); 5146 | + 5147 | + /* 5148 | + * we need to be safe, the node could be removed by get_uksm_page() 5149 | + */ 5150 | + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { 5151 | + void *addr; 5152 | + struct page *node_page; 5153 | + u32 hash; 5154 | + 5155 | + /* 5156 | + * We are completely re-structuring the stable nodes to a new 5157 | + * stable tree. We don't want to touch the old tree unlinks and 5158 | + * old tree_nodes. The old tree_nodes will be freed at once. 5159 | + */ 5160 | + node_page = get_uksm_page(node, 0, 0); 5161 | + if (!node_page) 5162 | + continue; 5163 | + 5164 | + if (node->tree_node) { 5165 | + hash = node->tree_node->hash; 5166 | + 5167 | + addr = kmap_atomic(node_page); 5168 | + 5169 | + hash = delta_hash(addr, prev_hash_strength, 5170 | + hash_strength, hash); 5171 | + kunmap_atomic(addr); 5172 | + } else { 5173 | + /* 5174 | + *it was not inserted to rbtree due to collision in last 5175 | + *round scan. 
5176 | + */ 5177 | + hash = page_hash(node_page, hash_strength, 0); 5178 | + } 5179 | + 5180 | + stable_node_reinsert(node, node_page, root_new_treep, 5181 | + new_tree_node_listp, hash); 5182 | + put_page(node_page); 5183 | + } 5184 | + 5185 | + root_stable_treep = root_new_treep; 5186 | + free_all_tree_nodes(stable_tree_node_listp); 5187 | + BUG_ON(!list_empty(stable_tree_node_listp)); 5188 | + stable_tree_node_listp = new_tree_node_listp; 5189 | +} 5190 | + 5191 | +static inline void inc_hash_strength(unsigned long delta) 5192 | +{ 5193 | + hash_strength += 1 << delta; 5194 | + if (hash_strength > HASH_STRENGTH_MAX) 5195 | + hash_strength = HASH_STRENGTH_MAX; 5196 | +} 5197 | + 5198 | +static inline void dec_hash_strength(unsigned long delta) 5199 | +{ 5200 | + unsigned long change = 1 << delta; 5201 | + 5202 | + if (hash_strength <= change + 1) 5203 | + hash_strength = 1; 5204 | + else 5205 | + hash_strength -= change; 5206 | +} 5207 | + 5208 | +static inline void inc_hash_strength_delta(void) 5209 | +{ 5210 | + hash_strength_delta++; 5211 | + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) 5212 | + hash_strength_delta = HASH_STRENGTH_DELTA_MAX; 5213 | +} 5214 | + 5215 | +/* 5216 | +static inline unsigned long get_current_neg_ratio(void) 5217 | +{ 5218 | + if (!rshash_pos || rshash_neg > rshash_pos) 5219 | + return 100; 5220 | + 5221 | + return div64_u64(100 * rshash_neg , rshash_pos); 5222 | +} 5223 | +*/ 5224 | + 5225 | +static inline unsigned long get_current_neg_ratio(void) 5226 | +{ 5227 | + u64 pos = benefit.pos; 5228 | + u64 neg = benefit.neg; 5229 | + 5230 | + if (!neg) 5231 | + return 0; 5232 | + 5233 | + if (!pos || neg > pos) 5234 | + return 100; 5235 | + 5236 | + if (neg > div64_u64(U64_MAX, 100)) 5237 | + pos = div64_u64(pos, 100); 5238 | + else 5239 | + neg *= 100; 5240 | + 5241 | + return div64_u64(neg, pos); 5242 | +} 5243 | + 5244 | +static inline unsigned long get_current_benefit(void) 5245 | +{ 5246 | + u64 pos = benefit.pos; 5247 | + u64 neg = benefit.neg; 5248 | + u64 scanned = benefit.scanned; 5249 | + 5250 | + if (neg > pos) 5251 | + return 0; 5252 | + 5253 | + return div64_u64((pos - neg), scanned); 5254 | +} 5255 | + 5256 | +static inline int judge_rshash_direction(void) 5257 | +{ 5258 | + u64 current_neg_ratio, stable_benefit; 5259 | + u64 current_benefit, delta = 0; 5260 | + int ret = STILL; 5261 | + 5262 | + /* Try to probe a value after the boot, and in case the system 5263 | + are still for a long time. 
*/ 5264 | + if ((fully_scanned_round & 0xFFULL) == 10) { 5265 | + ret = OBSCURE; 5266 | + goto out; 5267 | + } 5268 | + 5269 | + current_neg_ratio = get_current_neg_ratio(); 5270 | + 5271 | + if (current_neg_ratio == 0) { 5272 | + rshash_neg_cont_zero++; 5273 | + if (rshash_neg_cont_zero > 2) 5274 | + return GO_DOWN; 5275 | + else 5276 | + return STILL; 5277 | + } 5278 | + rshash_neg_cont_zero = 0; 5279 | + 5280 | + if (current_neg_ratio > 90) { 5281 | + ret = GO_UP; 5282 | + goto out; 5283 | + } 5284 | + 5285 | + current_benefit = get_current_benefit(); 5286 | + stable_benefit = rshash_state.stable_benefit; 5287 | + 5288 | + if (!stable_benefit) { 5289 | + ret = OBSCURE; 5290 | + goto out; 5291 | + } 5292 | + 5293 | + if (current_benefit > stable_benefit) 5294 | + delta = current_benefit - stable_benefit; 5295 | + else if (current_benefit < stable_benefit) 5296 | + delta = stable_benefit - current_benefit; 5297 | + 5298 | + delta = div64_u64(100 * delta , stable_benefit); 5299 | + 5300 | + if (delta > 50) { 5301 | + rshash_cont_obscure++; 5302 | + if (rshash_cont_obscure > 2) 5303 | + return OBSCURE; 5304 | + else 5305 | + return STILL; 5306 | + } 5307 | + 5308 | +out: 5309 | + rshash_cont_obscure = 0; 5310 | + return ret; 5311 | +} 5312 | + 5313 | +/** 5314 | + * rshash_adjust() - The main function to control the random sampling state 5315 | + * machine for hash strength adapting. 5316 | + * 5317 | + * return true if hash_strength has changed. 5318 | + */ 5319 | +static inline int rshash_adjust(void) 5320 | +{ 5321 | + unsigned long prev_hash_strength = hash_strength; 5322 | + 5323 | + if (!encode_benefit()) 5324 | + return 0; 5325 | + 5326 | + switch (rshash_state.state) { 5327 | + case RSHASH_STILL: 5328 | + switch (judge_rshash_direction()) { 5329 | + case GO_UP: 5330 | + if (rshash_state.pre_direct == GO_DOWN) 5331 | + hash_strength_delta = 0; 5332 | + 5333 | + inc_hash_strength(hash_strength_delta); 5334 | + inc_hash_strength_delta(); 5335 | + rshash_state.stable_benefit = get_current_benefit(); 5336 | + rshash_state.pre_direct = GO_UP; 5337 | + break; 5338 | + 5339 | + case GO_DOWN: 5340 | + if (rshash_state.pre_direct == GO_UP) 5341 | + hash_strength_delta = 0; 5342 | + 5343 | + dec_hash_strength(hash_strength_delta); 5344 | + inc_hash_strength_delta(); 5345 | + rshash_state.stable_benefit = get_current_benefit(); 5346 | + rshash_state.pre_direct = GO_DOWN; 5347 | + break; 5348 | + 5349 | + case OBSCURE: 5350 | + rshash_state.stable_point = hash_strength; 5351 | + rshash_state.turn_point_down = hash_strength; 5352 | + rshash_state.turn_point_up = hash_strength; 5353 | + rshash_state.turn_benefit_down = get_current_benefit(); 5354 | + rshash_state.turn_benefit_up = get_current_benefit(); 5355 | + rshash_state.lookup_window_index = 0; 5356 | + rshash_state.state = RSHASH_TRYDOWN; 5357 | + dec_hash_strength(hash_strength_delta); 5358 | + inc_hash_strength_delta(); 5359 | + break; 5360 | + 5361 | + case STILL: 5362 | + break; 5363 | + default: 5364 | + BUG(); 5365 | + } 5366 | + break; 5367 | + 5368 | + case RSHASH_TRYDOWN: 5369 | + if (rshash_state.lookup_window_index++ % 5 == 0) 5370 | + rshash_state.below_count = 0; 5371 | + 5372 | + if (get_current_benefit() < rshash_state.stable_benefit) 5373 | + rshash_state.below_count++; 5374 | + else if (get_current_benefit() > 5375 | + rshash_state.turn_benefit_down) { 5376 | + rshash_state.turn_point_down = hash_strength; 5377 | + rshash_state.turn_benefit_down = get_current_benefit(); 5378 | + } 5379 | + 5380 | + if 
(rshash_state.below_count >= 3 || 5381 | + judge_rshash_direction() == GO_UP || 5382 | + hash_strength == 1) { 5383 | + hash_strength = rshash_state.stable_point; 5384 | + hash_strength_delta = 0; 5385 | + inc_hash_strength(hash_strength_delta); 5386 | + inc_hash_strength_delta(); 5387 | + rshash_state.lookup_window_index = 0; 5388 | + rshash_state.state = RSHASH_TRYUP; 5389 | + hash_strength_delta = 0; 5390 | + } else { 5391 | + dec_hash_strength(hash_strength_delta); 5392 | + inc_hash_strength_delta(); 5393 | + } 5394 | + break; 5395 | + 5396 | + case RSHASH_TRYUP: 5397 | + if (rshash_state.lookup_window_index++ % 5 == 0) 5398 | + rshash_state.below_count = 0; 5399 | + 5400 | + if (get_current_benefit() < rshash_state.turn_benefit_down) 5401 | + rshash_state.below_count++; 5402 | + else if (get_current_benefit() > rshash_state.turn_benefit_up) { 5403 | + rshash_state.turn_point_up = hash_strength; 5404 | + rshash_state.turn_benefit_up = get_current_benefit(); 5405 | + } 5406 | + 5407 | + if (rshash_state.below_count >= 3 || 5408 | + judge_rshash_direction() == GO_DOWN || 5409 | + hash_strength == HASH_STRENGTH_MAX) { 5410 | + hash_strength = rshash_state.turn_benefit_up > 5411 | + rshash_state.turn_benefit_down ? 5412 | + rshash_state.turn_point_up : 5413 | + rshash_state.turn_point_down; 5414 | + 5415 | + rshash_state.state = RSHASH_PRE_STILL; 5416 | + } else { 5417 | + inc_hash_strength(hash_strength_delta); 5418 | + inc_hash_strength_delta(); 5419 | + } 5420 | + 5421 | + break; 5422 | + 5423 | + case RSHASH_NEW: 5424 | + case RSHASH_PRE_STILL: 5425 | + rshash_state.stable_benefit = get_current_benefit(); 5426 | + rshash_state.state = RSHASH_STILL; 5427 | + hash_strength_delta = 0; 5428 | + break; 5429 | + default: 5430 | + BUG(); 5431 | + } 5432 | + 5433 | + /* rshash_neg = rshash_pos = 0; */ 5434 | + reset_benefit(); 5435 | + 5436 | + if (prev_hash_strength != hash_strength) 5437 | + stable_tree_delta_hash(prev_hash_strength); 5438 | + 5439 | + return prev_hash_strength != hash_strength; 5440 | +} 5441 | + 5442 | +/** 5443 | + * round_update_ladder() - The main function to do update of all the 5444 | + * adjustments whenever a scan round is finished. 5445 | + */ 5446 | +static noinline void round_update_ladder(void) 5447 | +{ 5448 | + int i; 5449 | + unsigned long dedup; 5450 | + struct vma_slot *slot, *tmp_slot; 5451 | + 5452 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 5453 | + uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; 5454 | + } 5455 | + 5456 | + list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { 5457 | + 5458 | + /* slot may be rung_rm_slot() when mm exits */ 5459 | + if (slot->snode) { 5460 | + dedup = cal_dedup_ratio_old(slot); 5461 | + if (dedup && dedup >= uksm_abundant_threshold) 5462 | + vma_rung_up(slot); 5463 | + } 5464 | + 5465 | + slot->pages_bemerged = 0; 5466 | + slot->pages_cowed = 0; 5467 | + 5468 | + list_del_init(&slot->dedup_list); 5469 | + } 5470 | +} 5471 | + 5472 | +static void uksm_del_vma_slot(struct vma_slot *slot) 5473 | +{ 5474 | + int i, j; 5475 | + struct rmap_list_entry *entry; 5476 | + 5477 | + if (slot->snode) { 5478 | + /* 5479 | + * In case it just failed when entering the rung, it's not 5480 | + * necessary. 
5481 | + */ 5482 | + rung_rm_slot(slot); 5483 | + } 5484 | + 5485 | + if (!list_empty(&slot->dedup_list)) 5486 | + list_del(&slot->dedup_list); 5487 | + 5488 | + if (!slot->rmap_list_pool || !slot->pool_counts) { 5489 | + /* In case it OOMed in uksm_vma_enter() */ 5490 | + goto out; 5491 | + } 5492 | + 5493 | + for (i = 0; i < slot->pool_size; i++) { 5494 | + void *addr; 5495 | + 5496 | + if (!slot->rmap_list_pool[i]) 5497 | + continue; 5498 | + 5499 | + addr = kmap(slot->rmap_list_pool[i]); 5500 | + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { 5501 | + entry = (struct rmap_list_entry *)addr + j; 5502 | + if (is_addr(entry->addr)) 5503 | + continue; 5504 | + if (!entry->item) 5505 | + continue; 5506 | + 5507 | + remove_rmap_item_from_tree(entry->item); 5508 | + free_rmap_item(entry->item); 5509 | + slot->pool_counts[i]--; 5510 | + } 5511 | + BUG_ON(slot->pool_counts[i]); 5512 | + kunmap(slot->rmap_list_pool[i]); 5513 | + __free_page(slot->rmap_list_pool[i]); 5514 | + } 5515 | + kfree(slot->rmap_list_pool); 5516 | + kfree(slot->pool_counts); 5517 | + 5518 | +out: 5519 | + slot->rung = NULL; 5520 | + if (slot->flags & UKSM_SLOT_IN_UKSM) { 5521 | + BUG_ON(uksm_pages_total < slot->pages); 5522 | + uksm_pages_total -= slot->pages; 5523 | + } 5524 | + 5525 | + if (slot->fully_scanned_round == fully_scanned_round) 5526 | + scanned_virtual_pages -= slot->pages; 5527 | + else 5528 | + scanned_virtual_pages -= slot->pages_scanned; 5529 | + free_vma_slot(slot); 5530 | +} 5531 | + 5532 | + 5533 | +#define SPIN_LOCK_PERIOD 32 5534 | +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; 5535 | +static inline void cleanup_vma_slots(void) 5536 | +{ 5537 | + struct vma_slot *slot; 5538 | + int i; 5539 | + 5540 | + i = 0; 5541 | + spin_lock(&vma_slot_list_lock); 5542 | + while (!list_empty(&vma_slot_del)) { 5543 | + slot = list_entry(vma_slot_del.next, 5544 | + struct vma_slot, slot_list); 5545 | + list_del(&slot->slot_list); 5546 | + cleanup_slots[i++] = slot; 5547 | + if (i == SPIN_LOCK_PERIOD) { 5548 | + spin_unlock(&vma_slot_list_lock); 5549 | + while (--i >= 0) 5550 | + uksm_del_vma_slot(cleanup_slots[i]); 5551 | + i = 0; 5552 | + spin_lock(&vma_slot_list_lock); 5553 | + } 5554 | + } 5555 | + spin_unlock(&vma_slot_list_lock); 5556 | + 5557 | + while (--i >= 0) 5558 | + uksm_del_vma_slot(cleanup_slots[i]); 5559 | +} 5560 | + 5561 | +/* 5562 | +*exponential moving average formula 5563 | +*/ 5564 | +static inline unsigned long ema(unsigned long curr, unsigned long last_ema) 5565 | +{ 5566 | + /* 5567 | + * For a very high burst, even the ema cannot work well, a false very 5568 | + * high per-page time estimation can result in feedback in very high 5569 | + * overhead of context switch and rung update -- this will then lead 5570 | + * to higher per-page time, this may not converge. 5571 | + * 5572 | + * Instead, we try to approach this value in a binary manner. 5573 | + */ 5574 | + if (curr > last_ema * 10) 5575 | + return last_ema * 2; 5576 | + 5577 | + return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; 5578 | +} 5579 | + 5580 | +/* 5581 | + * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to 5582 | + * nanoseconds based on current uksm_sleep_jiffies. 
5583 | + */ 5584 | +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) 5585 | +{ 5586 | + return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / 5587 | + (TIME_RATIO_SCALE - ratio) * ratio; 5588 | +} 5589 | + 5590 | + 5591 | +static inline unsigned long rung_real_ratio(int cpu_time_ratio) 5592 | +{ 5593 | + unsigned long ret; 5594 | + 5595 | + BUG_ON(!cpu_time_ratio); 5596 | + 5597 | + if (cpu_time_ratio > 0) 5598 | + ret = cpu_time_ratio; 5599 | + else 5600 | + ret = (unsigned long)(-cpu_time_ratio) * 5601 | + uksm_max_cpu_percentage / 100UL; 5602 | + 5603 | + return ret ? ret : 1; 5604 | +} 5605 | + 5606 | +static noinline void uksm_calc_scan_pages(void) 5607 | +{ 5608 | + struct scan_rung *ladder = uksm_scan_ladder; 5609 | + unsigned long sleep_usecs, nsecs; 5610 | + unsigned long ratio; 5611 | + int i; 5612 | + unsigned long per_page; 5613 | + 5614 | + if (uksm_ema_page_time > 100000 || 5615 | + (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) 5616 | + uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; 5617 | + 5618 | + per_page = uksm_ema_page_time; 5619 | + BUG_ON(!per_page); 5620 | + 5621 | + /* 5622 | + * For every 8 eval round, we try to probe a uksm_sleep_jiffies value 5623 | + * based on saved user input. 5624 | + */ 5625 | + if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) 5626 | + uksm_sleep_jiffies = uksm_sleep_saved; 5627 | + 5628 | + /* We require a rung scan at least 1 page in a period. */ 5629 | + nsecs = per_page; 5630 | + ratio = rung_real_ratio(ladder[0].cpu_ratio); 5631 | + if (cpu_ratio_to_nsec(ratio) < nsecs) { 5632 | + sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio 5633 | + / NSEC_PER_USEC; 5634 | + uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; 5635 | + } 5636 | + 5637 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 5638 | + ratio = rung_real_ratio(ladder[i].cpu_ratio); 5639 | + ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / 5640 | + per_page; 5641 | + BUG_ON(!ladder[i].pages_to_scan); 5642 | + uksm_calc_rung_step(&ladder[i], per_page, ratio); 5643 | + } 5644 | +} 5645 | + 5646 | +/* 5647 | + * From the scan time of this round (ns) to next expected min sleep time 5648 | + * (ms), be careful of the possible overflows. 
ratio is taken from 5649 | + * rung_real_ratio() 5650 | + */ 5651 | +static inline 5652 | +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) 5653 | +{ 5654 | + scan_time >>= 20; /* to msec level now */ 5655 | + BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); 5656 | + 5657 | + return (unsigned int) ((unsigned long) scan_time * 5658 | + (TIME_RATIO_SCALE - ratio) / ratio); 5659 | +} 5660 | + 5661 | +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) 5662 | +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) 5663 | + 5664 | +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) 5665 | +{ 5666 | + struct scan_rung *rung; 5667 | + 5668 | + rung = &uksm_scan_ladder[0]; 5669 | + rung_add_new_slots(rung, slots, num); 5670 | +} 5671 | + 5672 | +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; 5673 | + 5674 | +static void uksm_enter_all_slots(void) 5675 | +{ 5676 | + struct vma_slot *slot; 5677 | + unsigned long index; 5678 | + struct list_head empty_vma_list; 5679 | + int i; 5680 | + 5681 | + i = 0; 5682 | + index = 0; 5683 | + INIT_LIST_HEAD(&empty_vma_list); 5684 | + 5685 | + spin_lock(&vma_slot_list_lock); 5686 | + while (!list_empty(&vma_slot_new)) { 5687 | + slot = list_entry(vma_slot_new.next, 5688 | + struct vma_slot, slot_list); 5689 | + 5690 | + if (!slot->vma->anon_vma) { 5691 | + list_move(&slot->slot_list, &empty_vma_list); 5692 | + } else if (vma_can_enter(slot->vma)) { 5693 | + batch_slots[index++] = slot; 5694 | + list_del_init(&slot->slot_list); 5695 | + } else { 5696 | + list_move(&slot->slot_list, &vma_slot_noadd); 5697 | + } 5698 | + 5699 | + if (++i == SPIN_LOCK_PERIOD || 5700 | + (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { 5701 | + spin_unlock(&vma_slot_list_lock); 5702 | + 5703 | + if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { 5704 | + uksm_vma_enter(batch_slots, index); 5705 | + index = 0; 5706 | + } 5707 | + i = 0; 5708 | + cond_resched(); 5709 | + spin_lock(&vma_slot_list_lock); 5710 | + } 5711 | + } 5712 | + 5713 | + list_splice(&empty_vma_list, &vma_slot_new); 5714 | + 5715 | + spin_unlock(&vma_slot_list_lock); 5716 | + 5717 | + if (index) 5718 | + uksm_vma_enter(batch_slots, index); 5719 | + 5720 | +} 5721 | + 5722 | +static inline int rung_round_finished(struct scan_rung *rung) 5723 | +{ 5724 | + return rung->flags & UKSM_RUNG_ROUND_FINISHED; 5725 | +} 5726 | + 5727 | +static inline void judge_slot(struct vma_slot *slot) 5728 | +{ 5729 | + struct scan_rung *rung = slot->rung; 5730 | + unsigned long dedup; 5731 | + int deleted; 5732 | + 5733 | + dedup = cal_dedup_ratio(slot); 5734 | + if (vma_fully_scanned(slot) && uksm_thrash_threshold) 5735 | + deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); 5736 | + else if (dedup && dedup >= uksm_abundant_threshold) 5737 | + deleted = vma_rung_up(slot); 5738 | + else 5739 | + deleted = vma_rung_down(slot); 5740 | + 5741 | + slot->pages_merged = 0; 5742 | + slot->pages_cowed = 0; 5743 | + slot->this_sampled = 0; 5744 | + 5745 | + if (vma_fully_scanned(slot)) { 5746 | + slot->pages_scanned = 0; 5747 | + } 5748 | + 5749 | + slot->last_scanned = slot->pages_scanned; 5750 | + 5751 | + /* If its deleted in above, then rung was already advanced. 
*/ 5752 | + if (!deleted) 5753 | + advance_current_scan(rung); 5754 | +} 5755 | + 5756 | + 5757 | +static inline int hash_round_finished(void) 5758 | +{ 5759 | + if (scanned_virtual_pages > (uksm_pages_total >> 2)) { 5760 | + scanned_virtual_pages = 0; 5761 | + if (uksm_pages_scanned) 5762 | + fully_scanned_round++; 5763 | + 5764 | + return 1; 5765 | + } else { 5766 | + return 0; 5767 | + } 5768 | +} 5769 | + 5770 | +#define UKSM_MMSEM_BATCH 5 5771 | +#define BUSY_RETRY 100 5772 | + 5773 | +/** 5774 | + * uksm_do_scan() - the main worker function. 5775 | + */ 5776 | +static noinline void uksm_do_scan(void) 5777 | +{ 5778 | + struct vma_slot *slot, *iter; 5779 | + struct mm_struct *busy_mm; 5780 | + unsigned char round_finished, all_rungs_emtpy; 5781 | + int i, err, mmsem_batch; 5782 | + unsigned long pcost; 5783 | + long long delta_exec; 5784 | + unsigned long vpages, max_cpu_ratio; 5785 | + unsigned long long start_time, end_time, scan_time; 5786 | + unsigned int expected_jiffies; 5787 | + 5788 | + might_sleep(); 5789 | + 5790 | + vpages = 0; 5791 | + 5792 | + start_time = task_sched_runtime(current); 5793 | + max_cpu_ratio = 0; 5794 | + mmsem_batch = 0; 5795 | + 5796 | + for (i = 0; i < SCAN_LADDER_SIZE;) { 5797 | + struct scan_rung *rung = &uksm_scan_ladder[i]; 5798 | + unsigned long ratio; 5799 | + int busy_retry; 5800 | + 5801 | + if (!rung->pages_to_scan) { 5802 | + i++; 5803 | + continue; 5804 | + } 5805 | + 5806 | + if (!rung->vma_root.num) { 5807 | + rung->pages_to_scan = 0; 5808 | + i++; 5809 | + continue; 5810 | + } 5811 | + 5812 | + ratio = rung_real_ratio(rung->cpu_ratio); 5813 | + if (ratio > max_cpu_ratio) 5814 | + max_cpu_ratio = ratio; 5815 | + 5816 | + busy_retry = BUSY_RETRY; 5817 | + /* 5818 | + * Do not consider rung_round_finished() here, just used up the 5819 | + * rung->pages_to_scan quota. 
5820 | + */ 5821 | + while (rung->pages_to_scan && rung->vma_root.num && 5822 | + likely(!freezing(current))) { 5823 | + int reset = 0; 5824 | + 5825 | + slot = rung->current_scan; 5826 | + 5827 | + BUG_ON(vma_fully_scanned(slot)); 5828 | + 5829 | + if (mmsem_batch) { 5830 | + err = 0; 5831 | + } else { 5832 | + err = try_down_read_slot_mmap_sem(slot); 5833 | + } 5834 | + 5835 | + if (err == -ENOENT) { 5836 | +rm_slot: 5837 | + rung_rm_slot(slot); 5838 | + continue; 5839 | + } 5840 | + 5841 | + busy_mm = slot->mm; 5842 | + 5843 | + if (err == -EBUSY) { 5844 | + /* skip other vmas on the same mm */ 5845 | + do { 5846 | + reset = advance_current_scan(rung); 5847 | + iter = rung->current_scan; 5848 | + busy_retry--; 5849 | + if (iter->vma->vm_mm != busy_mm || 5850 | + !busy_retry || reset) 5851 | + break; 5852 | + } while (1); 5853 | + 5854 | + if (iter->vma->vm_mm != busy_mm) { 5855 | + continue; 5856 | + } else { 5857 | + /* scan round finished */ 5858 | + break; 5859 | + } 5860 | + } 5861 | + 5862 | + BUG_ON(!vma_can_enter(slot->vma)); 5863 | + if (uksm_test_exit(slot->vma->vm_mm)) { 5864 | + mmsem_batch = 0; 5865 | + up_read(&slot->vma->vm_mm->mmap_sem); 5866 | + goto rm_slot; 5867 | + } 5868 | + 5869 | + if (mmsem_batch) 5870 | + mmsem_batch--; 5871 | + else 5872 | + mmsem_batch = UKSM_MMSEM_BATCH; 5873 | + 5874 | + /* Ok, we have taken the mmap_sem, ready to scan */ 5875 | + scan_vma_one_page(slot); 5876 | + rung->pages_to_scan--; 5877 | + vpages++; 5878 | + 5879 | + if (rung->current_offset + rung->step > slot->pages - 1 5880 | + || vma_fully_scanned(slot)) { 5881 | + up_read(&slot->vma->vm_mm->mmap_sem); 5882 | + judge_slot(slot); 5883 | + mmsem_batch = 0; 5884 | + } else { 5885 | + rung->current_offset += rung->step; 5886 | + if (!mmsem_batch) 5887 | + up_read(&slot->vma->vm_mm->mmap_sem); 5888 | + } 5889 | + 5890 | + busy_retry = BUSY_RETRY; 5891 | + cond_resched(); 5892 | + } 5893 | + 5894 | + if (mmsem_batch) { 5895 | + up_read(&slot->vma->vm_mm->mmap_sem); 5896 | + mmsem_batch = 0; 5897 | + } 5898 | + 5899 | + if (freezing(current)) 5900 | + break; 5901 | + 5902 | + cond_resched(); 5903 | + } 5904 | + end_time = task_sched_runtime(current); 5905 | + delta_exec = end_time - start_time; 5906 | + 5907 | + if (freezing(current)) 5908 | + return; 5909 | + 5910 | + cleanup_vma_slots(); 5911 | + uksm_enter_all_slots(); 5912 | + 5913 | + round_finished = 1; 5914 | + all_rungs_emtpy = 1; 5915 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 5916 | + struct scan_rung *rung = &uksm_scan_ladder[i]; 5917 | + 5918 | + if (rung->vma_root.num) { 5919 | + all_rungs_emtpy = 0; 5920 | + if (!rung_round_finished(rung)) 5921 | + round_finished = 0; 5922 | + } 5923 | + } 5924 | + 5925 | + if (all_rungs_emtpy) 5926 | + round_finished = 0; 5927 | + 5928 | + if (round_finished) { 5929 | + round_update_ladder(); 5930 | + uksm_eval_round++; 5931 | + 5932 | + if (hash_round_finished() && rshash_adjust()) { 5933 | + /* Reset the unstable root iff hash strength changed */ 5934 | + uksm_hash_round++; 5935 | + root_unstable_tree = RB_ROOT; 5936 | + free_all_tree_nodes(&unstable_tree_node_list); 5937 | + } 5938 | + 5939 | + /* 5940 | + * A number of pages can hang around indefinitely on per-cpu 5941 | + * pagevecs, raised page count preventing write_protect_page 5942 | + * from merging them. 
Though it doesn't really matter much, 5943 | + * it is puzzling to see some stuck in pages_volatile until 5944 | + * other activity jostles them out, and they also prevented 5945 | + * LTP's KSM test from succeeding deterministically; so drain 5946 | + * them here (here rather than on entry to uksm_do_scan(), 5947 | + * so we don't IPI too often when pages_to_scan is set low). 5948 | + */ 5949 | + lru_add_drain_all(); 5950 | + } 5951 | + 5952 | + 5953 | + if (vpages && delta_exec > 0) { 5954 | + pcost = (unsigned long) delta_exec / vpages; 5955 | + if (likely(uksm_ema_page_time)) 5956 | + uksm_ema_page_time = ema(pcost, uksm_ema_page_time); 5957 | + else 5958 | + uksm_ema_page_time = pcost; 5959 | + } 5960 | + 5961 | + uksm_calc_scan_pages(); 5962 | + uksm_sleep_real = uksm_sleep_jiffies; 5963 | + /* in case of radical cpu bursts, apply the upper bound */ 5964 | + end_time = task_sched_runtime(current); 5965 | + if (max_cpu_ratio && end_time > start_time) { 5966 | + scan_time = end_time - start_time; 5967 | + expected_jiffies = msecs_to_jiffies( 5968 | + scan_time_to_sleep(scan_time, max_cpu_ratio)); 5969 | + 5970 | + if (expected_jiffies > uksm_sleep_real) 5971 | + uksm_sleep_real = expected_jiffies; 5972 | + 5973 | + /* We have a 1 second up bound for responsiveness. */ 5974 | + if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) 5975 | + uksm_sleep_real = msecs_to_jiffies(1000); 5976 | + } 5977 | + 5978 | + return; 5979 | +} 5980 | + 5981 | +static int ksmd_should_run(void) 5982 | +{ 5983 | + return uksm_run & UKSM_RUN_MERGE; 5984 | +} 5985 | + 5986 | +static int uksm_scan_thread(void *nothing) 5987 | +{ 5988 | + set_freezable(); 5989 | + set_user_nice(current, 5); 5990 | + 5991 | + while (!kthread_should_stop()) { 5992 | + mutex_lock(&uksm_thread_mutex); 5993 | + if (ksmd_should_run()) { 5994 | + uksm_do_scan(); 5995 | + } 5996 | + mutex_unlock(&uksm_thread_mutex); 5997 | + 5998 | + try_to_freeze(); 5999 | + 6000 | + if (ksmd_should_run()) { 6001 | + schedule_timeout_interruptible(uksm_sleep_real); 6002 | + uksm_sleep_times++; 6003 | + } else { 6004 | + wait_event_freezable(uksm_thread_wait, 6005 | + ksmd_should_run() || kthread_should_stop()); 6006 | + } 6007 | + } 6008 | + return 0; 6009 | +} 6010 | + 6011 | +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) 6012 | +{ 6013 | + struct stable_node *stable_node; 6014 | + struct node_vma *node_vma; 6015 | + struct rmap_item *rmap_item; 6016 | + int ret = SWAP_AGAIN; 6017 | + int search_new_forks = 0; 6018 | + unsigned long address; 6019 | + 6020 | + VM_BUG_ON_PAGE(!PageKsm(page), page); 6021 | + VM_BUG_ON_PAGE(!PageLocked(page), page); 6022 | + 6023 | + stable_node = page_stable_node(page); 6024 | + if (!stable_node) 6025 | + return ret; 6026 | +again: 6027 | + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { 6028 | + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { 6029 | + struct anon_vma *anon_vma = rmap_item->anon_vma; 6030 | + struct anon_vma_chain *vmac; 6031 | + struct vm_area_struct *vma; 6032 | + 6033 | + cond_resched(); 6034 | + anon_vma_lock_read(anon_vma); 6035 | + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 6036 | + 0, ULONG_MAX) { 6037 | + cond_resched(); 6038 | + vma = vmac->vma; 6039 | + address = get_rmap_addr(rmap_item); 6040 | + 6041 | + if (address < vma->vm_start || 6042 | + address >= vma->vm_end) 6043 | + continue; 6044 | + 6045 | + if ((rmap_item->slot->vma == vma) == 6046 | + search_new_forks) 6047 | + continue; 6048 | + 6049 | + if (rwc->invalid_vma && 
rwc->invalid_vma(vma, rwc->arg)) 6050 | + continue; 6051 | + 6052 | + ret = rwc->rmap_one(page, vma, address, rwc->arg); 6053 | + if (ret != SWAP_AGAIN) { 6054 | + anon_vma_unlock_read(anon_vma); 6055 | + goto out; 6056 | + } 6057 | + 6058 | + if (rwc->done && rwc->done(page)) { 6059 | + anon_vma_unlock_read(anon_vma); 6060 | + goto out; 6061 | + } 6062 | + } 6063 | + anon_vma_unlock_read(anon_vma); 6064 | + } 6065 | + } 6066 | + if (!search_new_forks++) 6067 | + goto again; 6068 | +out: 6069 | + return ret; 6070 | +} 6071 | + 6072 | +#ifdef CONFIG_MIGRATION 6073 | +/* Common ksm interface but may be specific to uksm */ 6074 | +void ksm_migrate_page(struct page *newpage, struct page *oldpage) 6075 | +{ 6076 | + struct stable_node *stable_node; 6077 | + 6078 | + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6079 | + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6080 | + VM_BUG_ON(newpage->mapping != oldpage->mapping); 6081 | + 6082 | + stable_node = page_stable_node(newpage); 6083 | + if (stable_node) { 6084 | + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 6085 | + stable_node->kpfn = page_to_pfn(newpage); 6086 | + /* 6087 | + * newpage->mapping was set in advance; now we need smp_wmb() 6088 | + * to make sure that the new stable_node->kpfn is visible 6089 | + * to get_ksm_page() before it can see that oldpage->mapping 6090 | + * has gone stale (or that PageSwapCache has been cleared). 6091 | + */ 6092 | + smp_wmb(); 6093 | + set_page_stable_node(oldpage, NULL); 6094 | + } 6095 | +} 6096 | +#endif /* CONFIG_MIGRATION */ 6097 | + 6098 | +#ifdef CONFIG_MEMORY_HOTREMOVE 6099 | +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, 6100 | + unsigned long end_pfn) 6101 | +{ 6102 | + struct rb_node *node; 6103 | + 6104 | + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { 6105 | + struct stable_node *stable_node; 6106 | + 6107 | + stable_node = rb_entry(node, struct stable_node, node); 6108 | + if (stable_node->kpfn >= start_pfn && 6109 | + stable_node->kpfn < end_pfn) 6110 | + return stable_node; 6111 | + } 6112 | + return NULL; 6113 | +} 6114 | + 6115 | +static int uksm_memory_callback(struct notifier_block *self, 6116 | + unsigned long action, void *arg) 6117 | +{ 6118 | + struct memory_notify *mn = arg; 6119 | + struct stable_node *stable_node; 6120 | + 6121 | + switch (action) { 6122 | + case MEM_GOING_OFFLINE: 6123 | + /* 6124 | + * Keep it very simple for now: just lock out ksmd and 6125 | + * MADV_UNMERGEABLE while any memory is going offline. 6126 | + * mutex_lock_nested() is necessary because lockdep was alarmed 6127 | + * that here we take uksm_thread_mutex inside notifier chain 6128 | + * mutex, and later take notifier chain mutex inside 6129 | + * uksm_thread_mutex to unlock it. But that's safe because both 6130 | + * are inside mem_hotplug_mutex. 6131 | + */ 6132 | + mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); 6133 | + break; 6134 | + 6135 | + case MEM_OFFLINE: 6136 | + /* 6137 | + * Most of the work is done by page migration; but there might 6138 | + * be a few stable_nodes left over, still pointing to struct 6139 | + * pages which have been offlined: prune those from the tree. 
6140 | + */ 6141 | + while ((stable_node = uksm_check_stable_tree(mn->start_pfn, 6142 | + mn->start_pfn + mn->nr_pages)) != NULL) 6143 | + remove_node_from_stable_tree(stable_node, 1, 1); 6144 | + /* fallthrough */ 6145 | + 6146 | + case MEM_CANCEL_OFFLINE: 6147 | + mutex_unlock(&uksm_thread_mutex); 6148 | + break; 6149 | + } 6150 | + return NOTIFY_OK; 6151 | +} 6152 | +#endif /* CONFIG_MEMORY_HOTREMOVE */ 6153 | + 6154 | +#ifdef CONFIG_SYSFS 6155 | +/* 6156 | + * This all compiles without CONFIG_SYSFS, but is a waste of space. 6157 | + */ 6158 | + 6159 | +#define UKSM_ATTR_RO(_name) \ 6160 | + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 6161 | +#define UKSM_ATTR(_name) \ 6162 | + static struct kobj_attribute _name##_attr = \ 6163 | + __ATTR(_name, 0644, _name##_show, _name##_store) 6164 | + 6165 | +static ssize_t max_cpu_percentage_show(struct kobject *kobj, 6166 | + struct kobj_attribute *attr, char *buf) 6167 | +{ 6168 | + return sprintf(buf, "%u\n", uksm_max_cpu_percentage); 6169 | +} 6170 | + 6171 | +static ssize_t max_cpu_percentage_store(struct kobject *kobj, 6172 | + struct kobj_attribute *attr, 6173 | + const char *buf, size_t count) 6174 | +{ 6175 | + unsigned long max_cpu_percentage; 6176 | + int err; 6177 | + 6178 | + err = kstrtoul(buf, 10, &max_cpu_percentage); 6179 | + if (err || max_cpu_percentage > 100) 6180 | + return -EINVAL; 6181 | + 6182 | + if (max_cpu_percentage == 100) 6183 | + max_cpu_percentage = 99; 6184 | + else if (max_cpu_percentage < 10) 6185 | + max_cpu_percentage = 10; 6186 | + 6187 | + uksm_max_cpu_percentage = max_cpu_percentage; 6188 | + 6189 | + return count; 6190 | +} 6191 | +UKSM_ATTR(max_cpu_percentage); 6192 | + 6193 | +static ssize_t sleep_millisecs_show(struct kobject *kobj, 6194 | + struct kobj_attribute *attr, char *buf) 6195 | +{ 6196 | + return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); 6197 | +} 6198 | + 6199 | +static ssize_t sleep_millisecs_store(struct kobject *kobj, 6200 | + struct kobj_attribute *attr, 6201 | + const char *buf, size_t count) 6202 | +{ 6203 | + unsigned long msecs; 6204 | + int err; 6205 | + 6206 | + err = kstrtoul(buf, 10, &msecs); 6207 | + if (err || msecs > MSEC_PER_SEC) 6208 | + return -EINVAL; 6209 | + 6210 | + uksm_sleep_jiffies = msecs_to_jiffies(msecs); 6211 | + uksm_sleep_saved = uksm_sleep_jiffies; 6212 | + 6213 | + return count; 6214 | +} 6215 | +UKSM_ATTR(sleep_millisecs); 6216 | + 6217 | + 6218 | +static ssize_t cpu_governor_show(struct kobject *kobj, 6219 | + struct kobj_attribute *attr, char *buf) 6220 | +{ 6221 | + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); 6222 | + int i; 6223 | + 6224 | + buf[0] = '\0'; 6225 | + for (i = 0; i < n ; i++) { 6226 | + if (uksm_cpu_governor == i) 6227 | + strcat(buf, "["); 6228 | + 6229 | + strcat(buf, uksm_cpu_governor_str[i]); 6230 | + 6231 | + if (uksm_cpu_governor == i) 6232 | + strcat(buf, "]"); 6233 | + 6234 | + strcat(buf, " "); 6235 | + } 6236 | + strcat(buf, "\n"); 6237 | + 6238 | + return strlen(buf); 6239 | +} 6240 | + 6241 | +static inline void init_performance_values(void) 6242 | +{ 6243 | + int i; 6244 | + struct scan_rung *rung; 6245 | + struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; 6246 | + 6247 | + 6248 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6249 | + rung = uksm_scan_ladder + i; 6250 | + rung->cpu_ratio = preset->cpu_ratio[i]; 6251 | + rung->cover_msecs = preset->cover_msecs[i]; 6252 | + } 6253 | + 6254 | + uksm_max_cpu_percentage = preset->max_cpu; 6255 | +} 6256 | + 6257 | 
+static ssize_t cpu_governor_store(struct kobject *kobj, 6258 | + struct kobj_attribute *attr, 6259 | + const char *buf, size_t count) 6260 | +{ 6261 | + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); 6262 | + 6263 | + for (n--; n >=0 ; n--) { 6264 | + if (!strncmp(buf, uksm_cpu_governor_str[n], 6265 | + strlen(uksm_cpu_governor_str[n]))) 6266 | + break; 6267 | + } 6268 | + 6269 | + if (n < 0) 6270 | + return -EINVAL; 6271 | + else 6272 | + uksm_cpu_governor = n; 6273 | + 6274 | + init_performance_values(); 6275 | + 6276 | + return count; 6277 | +} 6278 | +UKSM_ATTR(cpu_governor); 6279 | + 6280 | +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 6281 | + char *buf) 6282 | +{ 6283 | + return sprintf(buf, "%u\n", uksm_run); 6284 | +} 6285 | + 6286 | +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 6287 | + const char *buf, size_t count) 6288 | +{ 6289 | + int err; 6290 | + unsigned long flags; 6291 | + 6292 | + err = kstrtoul(buf, 10, &flags); 6293 | + if (err || flags > UINT_MAX) 6294 | + return -EINVAL; 6295 | + if (flags > UKSM_RUN_MERGE) 6296 | + return -EINVAL; 6297 | + 6298 | + mutex_lock(&uksm_thread_mutex); 6299 | + if (uksm_run != flags) { 6300 | + uksm_run = flags; 6301 | + } 6302 | + mutex_unlock(&uksm_thread_mutex); 6303 | + 6304 | + if (flags & UKSM_RUN_MERGE) 6305 | + wake_up_interruptible(&uksm_thread_wait); 6306 | + 6307 | + return count; 6308 | +} 6309 | +UKSM_ATTR(run); 6310 | + 6311 | +static ssize_t abundant_threshold_show(struct kobject *kobj, 6312 | + struct kobj_attribute *attr, char *buf) 6313 | +{ 6314 | + return sprintf(buf, "%u\n", uksm_abundant_threshold); 6315 | +} 6316 | + 6317 | +static ssize_t abundant_threshold_store(struct kobject *kobj, 6318 | + struct kobj_attribute *attr, 6319 | + const char *buf, size_t count) 6320 | +{ 6321 | + int err; 6322 | + unsigned long flags; 6323 | + 6324 | + err = kstrtoul(buf, 10, &flags); 6325 | + if (err || flags > 99) 6326 | + return -EINVAL; 6327 | + 6328 | + uksm_abundant_threshold = flags; 6329 | + 6330 | + return count; 6331 | +} 6332 | +UKSM_ATTR(abundant_threshold); 6333 | + 6334 | +static ssize_t thrash_threshold_show(struct kobject *kobj, 6335 | + struct kobj_attribute *attr, char *buf) 6336 | +{ 6337 | + return sprintf(buf, "%u\n", uksm_thrash_threshold); 6338 | +} 6339 | + 6340 | +static ssize_t thrash_threshold_store(struct kobject *kobj, 6341 | + struct kobj_attribute *attr, 6342 | + const char *buf, size_t count) 6343 | +{ 6344 | + int err; 6345 | + unsigned long flags; 6346 | + 6347 | + err = kstrtoul(buf, 10, &flags); 6348 | + if (err || flags > 99) 6349 | + return -EINVAL; 6350 | + 6351 | + uksm_thrash_threshold = flags; 6352 | + 6353 | + return count; 6354 | +} 6355 | +UKSM_ATTR(thrash_threshold); 6356 | + 6357 | +static ssize_t cpu_ratios_show(struct kobject *kobj, 6358 | + struct kobj_attribute *attr, char *buf) 6359 | +{ 6360 | + int i, size; 6361 | + struct scan_rung *rung; 6362 | + char *p = buf; 6363 | + 6364 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6365 | + rung = &uksm_scan_ladder[i]; 6366 | + 6367 | + if (rung->cpu_ratio > 0) 6368 | + size = sprintf(p, "%d ", rung->cpu_ratio); 6369 | + else 6370 | + size = sprintf(p, "MAX/%d ", 6371 | + TIME_RATIO_SCALE / -rung->cpu_ratio); 6372 | + 6373 | + p += size; 6374 | + } 6375 | + 6376 | + *p++ = '\n'; 6377 | + *p = '\0'; 6378 | + 6379 | + return p - buf; 6380 | +} 6381 | + 6382 | +static ssize_t cpu_ratios_store(struct kobject *kobj, 6383 | + struct kobj_attribute *attr, 6384 | + const char 
*buf, size_t count) 6385 | +{ 6386 | + int i, cpuratios[SCAN_LADDER_SIZE], err; 6387 | + unsigned long value; 6388 | + struct scan_rung *rung; 6389 | + char *p, *end = NULL; 6390 | + 6391 | + p = kzalloc(count, GFP_KERNEL); 6392 | + if (!p) 6393 | + return -ENOMEM; 6394 | + 6395 | + memcpy(p, buf, count); 6396 | + 6397 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6398 | + if (i != SCAN_LADDER_SIZE -1) { 6399 | + end = strchr(p, ' '); 6400 | + if (!end) 6401 | + return -EINVAL; 6402 | + 6403 | + *end = '\0'; 6404 | + } 6405 | + 6406 | + if (strstr(p, "MAX/")) { 6407 | + p = strchr(p, '/') + 1; 6408 | + err = kstrtoul(p, 10, &value); 6409 | + if (err || value > TIME_RATIO_SCALE || !value) 6410 | + return -EINVAL; 6411 | + 6412 | + cpuratios[i] = - (int) (TIME_RATIO_SCALE / value); 6413 | + } else { 6414 | + err = kstrtoul(p, 10, &value); 6415 | + if (err || value > TIME_RATIO_SCALE || !value) 6416 | + return -EINVAL; 6417 | + 6418 | + cpuratios[i] = value; 6419 | + } 6420 | + 6421 | + p = end + 1; 6422 | + } 6423 | + 6424 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6425 | + rung = &uksm_scan_ladder[i]; 6426 | + 6427 | + rung->cpu_ratio = cpuratios[i]; 6428 | + } 6429 | + 6430 | + return count; 6431 | +} 6432 | +UKSM_ATTR(cpu_ratios); 6433 | + 6434 | +static ssize_t eval_intervals_show(struct kobject *kobj, 6435 | + struct kobj_attribute *attr, char *buf) 6436 | +{ 6437 | + int i, size; 6438 | + struct scan_rung *rung; 6439 | + char *p = buf; 6440 | + 6441 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6442 | + rung = &uksm_scan_ladder[i]; 6443 | + size = sprintf(p, "%u ", rung->cover_msecs); 6444 | + p += size; 6445 | + } 6446 | + 6447 | + *p++ = '\n'; 6448 | + *p = '\0'; 6449 | + 6450 | + return p - buf; 6451 | +} 6452 | + 6453 | +static ssize_t eval_intervals_store(struct kobject *kobj, 6454 | + struct kobj_attribute *attr, 6455 | + const char *buf, size_t count) 6456 | +{ 6457 | + int i, err; 6458 | + unsigned long values[SCAN_LADDER_SIZE]; 6459 | + struct scan_rung *rung; 6460 | + char *p, *end = NULL; 6461 | + ssize_t ret = count; 6462 | + 6463 | + p = kzalloc(count + 2, GFP_KERNEL); 6464 | + if (!p) 6465 | + return -ENOMEM; 6466 | + 6467 | + memcpy(p, buf, count); 6468 | + 6469 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6470 | + if (i != SCAN_LADDER_SIZE -1) { 6471 | + end = strchr(p, ' '); 6472 | + if (!end) { 6473 | + ret = -EINVAL; 6474 | + goto out; 6475 | + } 6476 | + 6477 | + *end = '\0'; 6478 | + } 6479 | + 6480 | + err = kstrtoul(p, 10, &values[i]); 6481 | + if (err) { 6482 | + ret = -EINVAL; 6483 | + goto out; 6484 | + } 6485 | + 6486 | + p = end + 1; 6487 | + } 6488 | + 6489 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6490 | + rung = &uksm_scan_ladder[i]; 6491 | + 6492 | + rung->cover_msecs = values[i]; 6493 | + } 6494 | + 6495 | +out: 6496 | + kfree(p); 6497 | + return ret; 6498 | +} 6499 | +UKSM_ATTR(eval_intervals); 6500 | + 6501 | +static ssize_t ema_per_page_time_show(struct kobject *kobj, 6502 | + struct kobj_attribute *attr, char *buf) 6503 | +{ 6504 | + return sprintf(buf, "%lu\n", uksm_ema_page_time); 6505 | +} 6506 | +UKSM_ATTR_RO(ema_per_page_time); 6507 | + 6508 | +static ssize_t pages_shared_show(struct kobject *kobj, 6509 | + struct kobj_attribute *attr, char *buf) 6510 | +{ 6511 | + return sprintf(buf, "%lu\n", uksm_pages_shared); 6512 | +} 6513 | +UKSM_ATTR_RO(pages_shared); 6514 | + 6515 | +static ssize_t pages_sharing_show(struct kobject *kobj, 6516 | + struct kobj_attribute *attr, char *buf) 6517 | +{ 6518 | + return sprintf(buf, "%lu\n", 
uksm_pages_sharing); 6519 | +} 6520 | +UKSM_ATTR_RO(pages_sharing); 6521 | + 6522 | +static ssize_t pages_unshared_show(struct kobject *kobj, 6523 | + struct kobj_attribute *attr, char *buf) 6524 | +{ 6525 | + return sprintf(buf, "%lu\n", uksm_pages_unshared); 6526 | +} 6527 | +UKSM_ATTR_RO(pages_unshared); 6528 | + 6529 | +static ssize_t full_scans_show(struct kobject *kobj, 6530 | + struct kobj_attribute *attr, char *buf) 6531 | +{ 6532 | + return sprintf(buf, "%llu\n", fully_scanned_round); 6533 | +} 6534 | +UKSM_ATTR_RO(full_scans); 6535 | + 6536 | +static ssize_t pages_scanned_show(struct kobject *kobj, 6537 | + struct kobj_attribute *attr, char *buf) 6538 | +{ 6539 | + unsigned long base = 0; 6540 | + u64 delta, ret; 6541 | + 6542 | + if (pages_scanned_stored) { 6543 | + base = pages_scanned_base; 6544 | + ret = pages_scanned_stored; 6545 | + delta = uksm_pages_scanned >> base; 6546 | + if (CAN_OVERFLOW_U64(ret, delta)) { 6547 | + ret >>= 1; 6548 | + delta >>= 1; 6549 | + base++; 6550 | + ret += delta; 6551 | + } 6552 | + } else { 6553 | + ret = uksm_pages_scanned; 6554 | + } 6555 | + 6556 | + while (ret > ULONG_MAX) { 6557 | + ret >>= 1; 6558 | + base++; 6559 | + } 6560 | + 6561 | + if (base) 6562 | + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); 6563 | + else 6564 | + return sprintf(buf, "%lu\n", (unsigned long)ret); 6565 | +} 6566 | +UKSM_ATTR_RO(pages_scanned); 6567 | + 6568 | +static ssize_t hash_strength_show(struct kobject *kobj, 6569 | + struct kobj_attribute *attr, char *buf) 6570 | +{ 6571 | + return sprintf(buf, "%lu\n", hash_strength); 6572 | +} 6573 | +UKSM_ATTR_RO(hash_strength); 6574 | + 6575 | +static ssize_t sleep_times_show(struct kobject *kobj, 6576 | + struct kobj_attribute *attr, char *buf) 6577 | +{ 6578 | + return sprintf(buf, "%llu\n", uksm_sleep_times); 6579 | +} 6580 | +UKSM_ATTR_RO(sleep_times); 6581 | + 6582 | + 6583 | +static struct attribute *uksm_attrs[] = { 6584 | + &max_cpu_percentage_attr.attr, 6585 | + &sleep_millisecs_attr.attr, 6586 | + &cpu_governor_attr.attr, 6587 | + &run_attr.attr, 6588 | + &ema_per_page_time_attr.attr, 6589 | + &pages_shared_attr.attr, 6590 | + &pages_sharing_attr.attr, 6591 | + &pages_unshared_attr.attr, 6592 | + &full_scans_attr.attr, 6593 | + &pages_scanned_attr.attr, 6594 | + &hash_strength_attr.attr, 6595 | + &sleep_times_attr.attr, 6596 | + &thrash_threshold_attr.attr, 6597 | + &abundant_threshold_attr.attr, 6598 | + &cpu_ratios_attr.attr, 6599 | + &eval_intervals_attr.attr, 6600 | + NULL, 6601 | +}; 6602 | + 6603 | +static struct attribute_group uksm_attr_group = { 6604 | + .attrs = uksm_attrs, 6605 | + .name = "uksm", 6606 | +}; 6607 | +#endif /* CONFIG_SYSFS */ 6608 | + 6609 | +static inline void init_scan_ladder(void) 6610 | +{ 6611 | + int i; 6612 | + struct scan_rung *rung; 6613 | + 6614 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6615 | + rung = uksm_scan_ladder + i; 6616 | + slot_tree_init_root(&rung->vma_root); 6617 | + } 6618 | + 6619 | + init_performance_values(); 6620 | + uksm_calc_scan_pages(); 6621 | +} 6622 | + 6623 | +static inline int cal_positive_negative_costs(void) 6624 | +{ 6625 | + struct page *p1, *p2; 6626 | + unsigned char *addr1, *addr2; 6627 | + unsigned long i, time_start, hash_cost; 6628 | + unsigned long loopnum = 0; 6629 | + 6630 | + /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. 
*/ 6631 | + volatile u32 hash; 6632 | + volatile int ret; 6633 | + 6634 | + p1 = alloc_page(GFP_KERNEL); 6635 | + if (!p1) 6636 | + return -ENOMEM; 6637 | + 6638 | + p2 = alloc_page(GFP_KERNEL); 6639 | + if (!p2) 6640 | + return -ENOMEM; 6641 | + 6642 | + addr1 = kmap_atomic(p1); 6643 | + addr2 = kmap_atomic(p2); 6644 | + memset(addr1, prandom_u32(), PAGE_SIZE); 6645 | + memcpy(addr2, addr1, PAGE_SIZE); 6646 | + 6647 | + /* make sure that the two pages differ in last byte */ 6648 | + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; 6649 | + kunmap_atomic(addr2); 6650 | + kunmap_atomic(addr1); 6651 | + 6652 | + time_start = jiffies; 6653 | + while (jiffies - time_start < 100) { 6654 | + for (i = 0; i < 100; i++) 6655 | + hash = page_hash(p1, HASH_STRENGTH_FULL, 0); 6656 | + loopnum += 100; 6657 | + } 6658 | + hash_cost = (jiffies - time_start); 6659 | + 6660 | + time_start = jiffies; 6661 | + for (i = 0; i < loopnum; i++) 6662 | + ret = pages_identical(p1, p2); 6663 | + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); 6664 | + memcmp_cost /= hash_cost; 6665 | + printk(KERN_INFO "UKSM: relative memcmp_cost = %lu " 6666 | + "hash=%u cmp_ret=%d.\n", 6667 | + memcmp_cost, hash, ret); 6668 | + 6669 | + __free_page(p1); 6670 | + __free_page(p2); 6671 | + return 0; 6672 | +} 6673 | + 6674 | +static int init_zeropage_hash_table(void) 6675 | +{ 6676 | + struct page *page; 6677 | + char *addr; 6678 | + int i; 6679 | + 6680 | + page = alloc_page(GFP_KERNEL); 6681 | + if (!page) 6682 | + return -ENOMEM; 6683 | + 6684 | + addr = kmap_atomic(page); 6685 | + memset(addr, 0, PAGE_SIZE); 6686 | + kunmap_atomic(addr); 6687 | + 6688 | + zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32), 6689 | + GFP_KERNEL); 6690 | + if (!zero_hash_table) 6691 | + return -ENOMEM; 6692 | + 6693 | + for (i = 0; i < HASH_STRENGTH_MAX; i++) 6694 | + zero_hash_table[i] = page_hash(page, i, 0); 6695 | + 6696 | + __free_page(page); 6697 | + 6698 | + return 0; 6699 | +} 6700 | + 6701 | +static inline int init_random_sampling(void) 6702 | +{ 6703 | + unsigned long i; 6704 | + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); 6705 | + if (!random_nums) 6706 | + return -ENOMEM; 6707 | + 6708 | + for (i = 0; i < HASH_STRENGTH_FULL; i++) 6709 | + random_nums[i] = i; 6710 | + 6711 | + for (i = 0; i < HASH_STRENGTH_FULL; i++) { 6712 | + unsigned long rand_range, swap_index, tmp; 6713 | + 6714 | + rand_range = HASH_STRENGTH_FULL - i; 6715 | + swap_index = i + prandom_u32() % rand_range; 6716 | + tmp = random_nums[i]; 6717 | + random_nums[i] = random_nums[swap_index]; 6718 | + random_nums[swap_index] = tmp; 6719 | + } 6720 | + 6721 | + rshash_state.state = RSHASH_NEW; 6722 | + rshash_state.below_count = 0; 6723 | + rshash_state.lookup_window_index = 0; 6724 | + 6725 | + return cal_positive_negative_costs(); 6726 | +} 6727 | + 6728 | +static int __init uksm_slab_init(void) 6729 | +{ 6730 | + rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); 6731 | + if (!rmap_item_cache) 6732 | + goto out; 6733 | + 6734 | + stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); 6735 | + if (!stable_node_cache) 6736 | + goto out_free1; 6737 | + 6738 | + node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); 6739 | + if (!node_vma_cache) 6740 | + goto out_free2; 6741 | + 6742 | + vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); 6743 | + if (!vma_slot_cache) 6744 | + goto out_free3; 6745 | + 6746 | + tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); 6747 | + if (!tree_node_cache) 6748 | + goto out_free4; 6749 | + 6750 | + return 0; 6751 | + 6752 | +out_free4: 6753 | + 
kmem_cache_destroy(vma_slot_cache); 6754 | +out_free3: 6755 | + kmem_cache_destroy(node_vma_cache); 6756 | +out_free2: 6757 | + kmem_cache_destroy(stable_node_cache); 6758 | +out_free1: 6759 | + kmem_cache_destroy(rmap_item_cache); 6760 | +out: 6761 | + return -ENOMEM; 6762 | +} 6763 | + 6764 | +static void __init uksm_slab_free(void) 6765 | +{ 6766 | + kmem_cache_destroy(stable_node_cache); 6767 | + kmem_cache_destroy(rmap_item_cache); 6768 | + kmem_cache_destroy(node_vma_cache); 6769 | + kmem_cache_destroy(vma_slot_cache); 6770 | + kmem_cache_destroy(tree_node_cache); 6771 | +} 6772 | + 6773 | +/* Common interface to ksm, different to it. */ 6774 | +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 6775 | + unsigned long end, int advice, unsigned long *vm_flags) 6776 | +{ 6777 | + int err; 6778 | + 6779 | + switch (advice) { 6780 | + case MADV_MERGEABLE: 6781 | + return 0; /* just ignore the advice */ 6782 | + 6783 | + case MADV_UNMERGEABLE: 6784 | + if (!(*vm_flags & VM_MERGEABLE)) 6785 | + return 0; /* just ignore the advice */ 6786 | + 6787 | + if (vma->anon_vma) { 6788 | + err = unmerge_uksm_pages(vma, start, end); 6789 | + if (err) 6790 | + return err; 6791 | + } 6792 | + 6793 | + uksm_remove_vma(vma); 6794 | + *vm_flags &= ~VM_MERGEABLE; 6795 | + break; 6796 | + } 6797 | + 6798 | + return 0; 6799 | +} 6800 | + 6801 | +/* Common interface to ksm, actually the same. */ 6802 | +struct page *ksm_might_need_to_copy(struct page *page, 6803 | + struct vm_area_struct *vma, unsigned long address) 6804 | +{ 6805 | + struct anon_vma *anon_vma = page_anon_vma(page); 6806 | + struct page *new_page; 6807 | + 6808 | + if (PageKsm(page)) { 6809 | + if (page_stable_node(page)) 6810 | + return page; /* no need to copy it */ 6811 | + } else if (!anon_vma) { 6812 | + return page; /* no need to copy it */ 6813 | + } else if (anon_vma->root == vma->anon_vma->root && 6814 | + page->index == linear_page_index(vma, address)) { 6815 | + return page; /* still no need to copy it */ 6816 | + } 6817 | + if (!PageUptodate(page)) 6818 | + return page; /* let do_swap_page report the error */ 6819 | + 6820 | + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 6821 | + if (new_page) { 6822 | + copy_user_highpage(new_page, page, address, vma); 6823 | + 6824 | + SetPageDirty(new_page); 6825 | + __SetPageUptodate(new_page); 6826 | + __SetPageLocked(new_page); 6827 | + } 6828 | + 6829 | + return new_page; 6830 | +} 6831 | + 6832 | +static int __init uksm_init(void) 6833 | +{ 6834 | + struct task_struct *uksm_thread; 6835 | + int err; 6836 | + 6837 | + uksm_sleep_jiffies = msecs_to_jiffies(100); 6838 | + uksm_sleep_saved = uksm_sleep_jiffies; 6839 | + 6840 | + slot_tree_init(); 6841 | + init_scan_ladder(); 6842 | + 6843 | + 6844 | + err = init_random_sampling(); 6845 | + if (err) 6846 | + goto out_free2; 6847 | + 6848 | + err = uksm_slab_init(); 6849 | + if (err) 6850 | + goto out_free1; 6851 | + 6852 | + err = init_zeropage_hash_table(); 6853 | + if (err) 6854 | + goto out_free0; 6855 | + 6856 | + uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); 6857 | + if (IS_ERR(uksm_thread)) { 6858 | + printk(KERN_ERR "uksm: creating kthread failed\n"); 6859 | + err = PTR_ERR(uksm_thread); 6860 | + goto out_free; 6861 | + } 6862 | + 6863 | +#ifdef CONFIG_SYSFS 6864 | + err = sysfs_create_group(mm_kobj, &uksm_attr_group); 6865 | + if (err) { 6866 | + printk(KERN_ERR "uksm: register sysfs failed\n"); 6867 | + kthread_stop(uksm_thread); 6868 | + goto out_free; 6869 | + } 6870 | +#else 6871 | + 
uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ 6872 | + 6873 | +#endif /* CONFIG_SYSFS */ 6874 | + 6875 | +#ifdef CONFIG_MEMORY_HOTREMOVE 6876 | + /* 6877 | + * Choose a high priority since the callback takes uksm_thread_mutex: 6878 | + * later callbacks could only be taking locks which nest within that. 6879 | + */ 6880 | + hotplug_memory_notifier(uksm_memory_callback, 100); 6881 | +#endif 6882 | + return 0; 6883 | + 6884 | +out_free: 6885 | + kfree(zero_hash_table); 6886 | +out_free0: 6887 | + uksm_slab_free(); 6888 | +out_free1: 6889 | + kfree(random_nums); 6890 | +out_free2: 6891 | + kfree(uksm_scan_ladder); 6892 | + return err; 6893 | +} 6894 | + 6895 | +#ifdef MODULE 6896 | +subsys_initcall(ksm_init); 6897 | +#else 6898 | +late_initcall(uksm_init); 6899 | +#endif 6900 | + 6901 | diff --git a/mm/vmstat.c b/mm/vmstat.c 6902 | index cb2a67b..912b86f 100644 6903 | --- a/mm/vmstat.c 6904 | +++ b/mm/vmstat.c 6905 | @@ -733,6 +733,9 @@ const char * const vmstat_text[] = { 6906 | "nr_anon_transparent_hugepages", 6907 | "nr_free_cma", 6908 | 6909 | +#ifdef CONFIG_UKSM 6910 | + "nr_uksm_zero_pages", 6911 | +#endif 6912 | /* enum writeback_stat_item counters */ 6913 | "nr_dirty_threshold", 6914 | "nr_dirty_background_threshold", 6915 | --------------------------------------------------------------------------------