├── 0001-Huge-page-allocation-hook.patch ├── 0002-Add-kernel-hooks.patch ├── Makefile ├── README.md └── jc.c /0001-Huge-page-allocation-hook.patch: -------------------------------------------------------------------------------- 1 | From ff726a38bac8d0cc7a1c2934aaedb00666c204f3 Mon Sep 17 00:00:00 2001 2 | From: Baptiste Lepers 3 | Date: Wed, 25 May 2022 06:51:45 +0000 4 | Subject: [PATCH 1/1] Huge page allocation hook 5 | 6 | --- 7 | include/linux/hugetlb.h | 4 ++++ 8 | mm/hugetlb.c | 10 ++++++++++ 9 | tools/perf/Makefile.perf | 3 ++- 10 | 3 files changed, 16 insertions(+), 1 deletion(-) 11 | 12 | diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h 13 | index 1faebe1cd0ed..dae593460e2c 100644 14 | --- a/include/linux/hugetlb.h 15 | +++ b/include/linux/hugetlb.h 16 | @@ -1103,4 +1103,8 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); 17 | #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) 18 | #endif 19 | 20 | + 21 | +typedef struct page *(*dequeue_hook_t)(struct hstate *h, int nid); 22 | +void set_dequeue_hook(dequeue_hook_t hook); 23 | + 24 | #endif /* _LINUX_HUGETLB_H */ 25 | diff --git a/mm/hugetlb.c b/mm/hugetlb.c 26 | index 95dc7b83381f..16f06224a9b9 100644 27 | --- a/mm/hugetlb.c 28 | +++ b/mm/hugetlb.c 29 | @@ -1080,11 +1080,21 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) 30 | SetHPageFreed(page); 31 | } 32 | 33 | +static dequeue_hook_t dequeue_hook = NULL; 34 | +void set_dequeue_hook(dequeue_hook_t hook) 35 | +{ 36 | + dequeue_hook = hook; 37 | +} 38 | +EXPORT_SYMBOL(set_dequeue_hook); 39 | + 40 | static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) 41 | { 42 | struct page *page; 43 | bool pin = !!(current->flags & PF_MEMALLOC_PIN); 44 | 45 | + if(dequeue_hook) 46 | + return dequeue_hook(h, nid); 47 | + 48 | lockdep_assert_held(&hugetlb_lock); 49 | list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { 50 | if (pin && !is_pinnable_page(page)) 51 | diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf 52 | index e04313c4d840..cd3641e15343 100644 53 | --- a/tools/perf/Makefile.perf 54 | +++ b/tools/perf/Makefile.perf 55 | @@ -51,7 +51,8 @@ include ../scripts/utilities.mak 56 | # Define GTK2 if you want GTK+ GUI support. 57 | # 58 | # Define NO_DEMANGLE if you do not want C++ symbol demangling. 59 | -# 60 | +# 61 | +CFLAGS += -DNO_DEMANGLE 62 | # Define NO_LIBELF if you do not want libelf dependency (e.g. 
cross-builds) 63 | # 64 | # Define NO_LIBUNWIND if you do not want libunwind dependency for dwarf 65 | -- 66 | 2.25.1 67 | 68 | -------------------------------------------------------------------------------- /0002-Add-kernel-hooks.patch: -------------------------------------------------------------------------------- 1 | From deb09ca95fe3067a67595a1e4fe56a6f6f7da6d1 Mon Sep 17 00:00:00 2001 2 | From: Baptiste Lepers 3 | Date: Wed, 27 Jul 2022 00:58:32 +0000 4 | Subject: [PATCH 2/2] Add kernel hooks 5 | 6 | --- 7 | include/linux/hugetlb.h | 8 ++++++++ 8 | kernel/events/core.c | 2 ++ 9 | mm/hugetlb.c | 27 +++++++++++++++++++++++++++ 10 | mm/migrate.c | 1 + 11 | mm/vmscan.c | 1 + 12 | 5 files changed, 39 insertions(+) 13 | 14 | diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h 15 | index dae593460e2c..298f99550203 100644 16 | --- a/include/linux/hugetlb.h 17 | +++ b/include/linux/hugetlb.h 18 | @@ -115,6 +115,8 @@ extern struct resv_map *resv_map_alloc(void); 19 | void resv_map_release(struct kref *ref); 20 | 21 | extern spinlock_t hugetlb_lock; 22 | +spinlock_t *get_hugetlb_lock(void); 23 | + 24 | extern int hugetlb_max_hstate __read_mostly; 25 | #define for_each_hstate(h) \ 26 | for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) 27 | @@ -1107,4 +1109,10 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); 28 | typedef struct page *(*dequeue_hook_t)(struct hstate *h, int nid); 29 | void set_dequeue_hook(dequeue_hook_t hook); 30 | 31 | +typedef struct page *(*enqueue_hook_t)(struct hstate *h, struct page *page); 32 | +void set_enqueue_hook(enqueue_hook_t hook); 33 | + 34 | +int get_max_hstates(void); 35 | +struct hstate *get_hstates(void); 36 | + 37 | #endif /* _LINUX_HUGETLB_H */ 38 | diff --git a/kernel/events/core.c b/kernel/events/core.c 39 | index 0c000cb01eeb..a2d96352f59d 100644 40 | --- a/kernel/events/core.c 41 | +++ b/kernel/events/core.c 42 | @@ -7441,6 +7441,8 @@ void perf_prepare_sample(struct perf_event_header *header, 43 | */ 44 | WARN_ON_ONCE(header->size & 7); 45 | } 46 | +EXPORT_SYMBOL_GPL(perf_prepare_sample); 47 | + 48 | 49 | static __always_inline int 50 | __perf_event_output(struct perf_event *event, 51 | diff --git a/mm/hugetlb.c b/mm/hugetlb.c 52 | index 16f06224a9b9..c7cacf19a769 100644 53 | --- a/mm/hugetlb.c 54 | +++ b/mm/hugetlb.c 55 | @@ -72,6 +72,10 @@ static bool __initdata parsed_default_hugepagesz; 56 | * free_huge_pages, and surplus_huge_pages. 57 | */ 58 | DEFINE_SPINLOCK(hugetlb_lock); 59 | +spinlock_t *get_hugetlb_lock(void) { 60 | + return &hugetlb_lock; 61 | +} 62 | +EXPORT_SYMBOL(get_hugetlb_lock); 63 | 64 | /* 65 | * Serializes faults on the same logical page. 
This is used to 66 | @@ -1067,10 +1071,22 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) 67 | return false; 68 | } 69 | 70 | +static enqueue_hook_t enqueue_hook = NULL; 71 | +void set_enqueue_hook(enqueue_hook_t hook) 72 | +{ 73 | + enqueue_hook = hook; 74 | +} 75 | +EXPORT_SYMBOL(set_enqueue_hook); 76 | + 77 | static void enqueue_huge_page(struct hstate *h, struct page *page) 78 | { 79 | int nid = page_to_nid(page); 80 | 81 | + if(enqueue_hook) { 82 | + enqueue_hook(h, page); 83 | + return; 84 | + } 85 | + 86 | lockdep_assert_held(&hugetlb_lock); 87 | VM_BUG_ON_PAGE(page_count(page), page); 88 | 89 | @@ -1078,6 +1094,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) 90 | h->free_huge_pages++; 91 | h->free_huge_pages_node[nid]++; 92 | SetHPageFreed(page); 93 | + 94 | } 95 | 96 | static dequeue_hook_t dequeue_hook = NULL; 97 | @@ -1087,6 +1104,16 @@ void set_dequeue_hook(dequeue_hook_t hook) 98 | } 99 | EXPORT_SYMBOL(set_dequeue_hook); 100 | 101 | +int get_max_hstates(void) { 102 | + return hugetlb_max_hstate; 103 | +} 104 | +EXPORT_SYMBOL(get_max_hstates); 105 | + 106 | +struct hstate *get_hstates(void) { 107 | + return &hstates[0]; 108 | +} 109 | +EXPORT_SYMBOL(get_hstates); 110 | + 111 | static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) 112 | { 113 | struct page *page; 114 | diff --git a/mm/migrate.c b/mm/migrate.c 115 | index a6a7743ee98f..eb80305e5fc7 100644 116 | --- a/mm/migrate.c 117 | +++ b/mm/migrate.c 118 | @@ -1601,6 +1601,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, 119 | 120 | return rc; 121 | } 122 | +EXPORT_SYMBOL(migrate_pages); 123 | 124 | struct page *alloc_migration_target(struct page *page, unsigned long private) 125 | { 126 | diff --git a/mm/vmscan.c b/mm/vmscan.c 127 | index 74296c2d1fed..69ae1cda8dfb 100644 128 | --- a/mm/vmscan.c 129 | +++ b/mm/vmscan.c 130 | @@ -2107,6 +2107,7 @@ int isolate_lru_page(struct page *page) 131 | 132 | return ret; 133 | } 134 | +EXPORT_SYMBOL(isolate_lru_page); 135 | 136 | /* 137 | * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and 138 | -- 139 | 2.25.1 140 | 141 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | obj-m += jc.o 2 | 3 | all: 4 | make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules 5 | 6 | clean: 7 | make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Johnny Cache: the End of DRAM Cache Conflicts (in Tiered Main Memory Systems) 2 | 3 | Paper: [https://www.usenix.org/conference/osdi23/presentation/lepers](https://www.usenix.org/conference/osdi23/presentation/lepers) 4 | 5 | ## Usage 6 | Patch your kernel using the provided patches. 7 | 8 | Edit jc.c to configure JC (see comment on top of the file). I recommend to first try the module without the dynamic policy (default setting). 9 | 10 | JC only handles 2MB pages. To force an application to allocate huge pages, we use TCMALLOC ([https://github.com/gperftools/gperftools](https://github.com/gperftools/gperftools)). 
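
To check that the kernel hooks are actually being exercised, you do not strictly need TCMALLOC: any hugetlbfs-backed mapping goes through the patched allocation path. The small user-space program below is not part of JC; it is a minimal sketch that assumes the `/mnt/hugetlbfs` mount point created by the commands below and simply touches one 2MB page, so the module's dequeue hook (allocation) and enqueue hook (free) should each fire once.

```c
/* Map and touch a single 2MB huge page from hugetlbfs, then release it.
 * The file name is a placeholder; adjust the path to your mount point. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HUGE_2MB (2UL * 1024 * 1024)

int main(void)
{
    int fd = open("/mnt/hugetlbfs/jc-test", O_CREAT | O_RDWR, 0644);
    if (fd < 0) { perror("open"); return 1; }

    void *p = mmap(NULL, HUGE_2MB, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

    memset(p, 0, HUGE_2MB);           /* first touch faults the page in -> dequeue hook */
    munmap(p, HUGE_2MB);
    close(fd);
    unlink("/mnt/hugetlbfs/jc-test"); /* removing the file frees the page -> enqueue hook */
    return 0;
}
```

After `sudo rmmod jc`, the "Total allocated pages" counter that the module prints to dmesg should have increased accordingly.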
11 | 12 | ```bash 13 | # Reserve a few large pages 14 | sudo sh -c "echo 174080 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages" # 340GB of large pages 15 | sudo mount -t hugetlbfs none /mnt/hugetlbfs 16 | sudo chmod 777 /mnt/hugetlbfs/ 17 | 18 | # Insert the module and launch BC with 20 threads 19 | sudo dmesg -c; # clear dmesg 20 | make; # READ the comment of jc.c BEFORE compiling! 21 | sudo insmod jc.ko; 22 | 23 | numactl --cpunodebind 0 env OMP_NUM_THREADS=20 TCMALLOC_MEMFS_MALLOC_PATH=/mnt/hugetlbfs/ LD_PRELOAD=~/gperftools/.libs/libtcmalloc.so ~/hemem/apps/gapbs/bc -u 29; 24 | 25 | sudo rmmod jc; 26 | sudo dmesg -c; # print stats 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /jc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Johnny Cache 3 | * 4 | * Logic of the code: 5 | * 6 | * 7 | * The static policy is simple; it uses 3 functions: 8 | * build_page_list(); // Builds the list of available pages 9 | * set_dequeue_hook(minimize_conflicts); // Called when allocating a page 10 | * set_enqueue_hook(enqueue_freed_page); // Called when freeing a page 11 | * 12 | * minimize_conflicts calls reserve_page, which increases the heat of a bucket (atomic_add...) 13 | * enqueue_freed_page calls add_freed_page, which decreases the heat of the bucket of the freed page 14 | * 15 | * Both functions call move_pages_contention() to maintain the list of pages ordered by heat 16 | * 17 | * None of these functions are efficient, but they do the job. 18 | * 19 | * 20 | * 21 | * For the dynamic policy, we tried many implementations. None really worked (we have a paragraph 22 | * discussing that in the paper). The current implementation is the following: 23 | * ENABLE_SAMPLING and ENABLE_PERIODIC_MIGRATIONS need to be set to 1 24 | * 25 | * In init_module, we set up the sampling. Whenever a memory access is sampled, pebs_sample is called. 26 | * pebs_sample calls add_sample, which increases the heat of pages & cache slots. 27 | * 28 | * Currently the implementation relies on a ring buffer that stores the last N samples. When a page 29 | * is sampled, its heat increases and its address is placed in the ring. While doing so, the sampled 30 | * page may replace another page, whose heat will be decreased. So basically we maintain a "moving 31 | * average" of the heat of pages, over the last N samples. 32 | * 33 | * Periodically, a kernel thread wakes up and calls periodic_migrations(), which migrates a page in case of conflict. 34 | * 35 | * 36 | * AGAIN, keep in mind that the dynamic migrations DO NOT WORK WELL. As we describe in the paper, configuring 37 | * dynamic migrations is very finicky. You'll need to adjust the sampling rate, etc. to get it to work. 38 | * 39 | * 40 | * 41 | * /!\ IMPORTANT /!\ 42 | * Don't forget to set CACHE_SIZE to the size of your DRAM cache. 43 | * Experiments were done on a machine with 128GB of RAM (64GB per NUMA node), so our cache is 48GB (per NUMA node). 44 | * 45 | */ 46 | #include /* Needed by all modules */ 47 | #include 48 | #include /* Needed for KERN_INFO */ 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | 58 | uint64_t total_samples = 0; 59 | uint64_t total_samples_found = 0; 60 | uint64_t *pfns = NULL; 61 | size_t pfn_idx = 0; 62 | 63 | #define ENABLE_SAMPLING 0 /* Activate dynamic conflict avoidance? 
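Set to 1 (together with ENABLE_PERIODIC_MIGRATIONS below) to enable the dynamic policy described in the header comment.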
*/ 64 | #define ENABLE_PERIODIC_MIGRATIONS 0 /* Should also be 1 for the dynamic policy to work */ 65 | 66 | 67 | /* 68 | * Some helper functions to manipulate pages & bitmasks, copy/pasted from the kernel. 69 | */ 70 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ 71 | static inline unsigned long *get_pageblock_bitmap(const struct page *page, 72 | unsigned long pfn) 73 | { 74 | #ifdef CONFIG_SPARSEMEM 75 | return section_to_usemap(__pfn_to_section(pfn)); 76 | #else 77 | return page_zone(page)->pageblock_flags; 78 | #endif /* CONFIG_SPARSEMEM */ 79 | } 80 | 81 | static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) 82 | { 83 | #ifdef CONFIG_SPARSEMEM 84 | pfn &= (PAGES_PER_SECTION-1); 85 | #else 86 | pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); 87 | #endif /* CONFIG_SPARSEMEM */ 88 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 89 | } 90 | 91 | static __always_inline 92 | unsigned long __get_pfnblock_flags_mask(const struct page *page, 93 | unsigned long pfn, 94 | unsigned long mask) 95 | { 96 | unsigned long *bitmap; 97 | unsigned long bitidx, word_bitidx; 98 | unsigned long word; 99 | 100 | bitmap = get_pageblock_bitmap(page, pfn); 101 | bitidx = pfn_to_bitidx(page, pfn); 102 | word_bitidx = bitidx / BITS_PER_LONG; 103 | bitidx &= (BITS_PER_LONG-1); 104 | 105 | word = bitmap[word_bitidx]; 106 | return (word >> bitidx) & mask; 107 | } 108 | 109 | unsigned long get_pfnblock_flags_mask(const struct page *page, 110 | unsigned long pfn, unsigned long mask) 111 | { 112 | return __get_pfnblock_flags_mask(page, pfn, mask); 113 | } 114 | 115 | struct migration_target_control { 116 | pid_t pid; 117 | }; 118 | 119 | 120 | /* 121 | * Actual code of JC 122 | */ 123 | static int max_conflicts = 0; // module parameter, maximize conflicts or minize them? 124 | #define CACHE_SIZE (48*512) // DRAM is 64GB -> cache is 48GB on our machine -> 48*512 2MB pages 125 | #define RING_SIZE (1000000) // Compute stats on the last XX memory access samples 126 | #define CONTENTION_LEVELS 20 // Number of contention levels (to be in level N, a page must have been accessed heat_to_contention_level(N) times) 127 | #define PROBLEMATIC_CONTENTION 10 // If a page has been accessed more than that, it is hot 128 | #define MAX_MIGRATED_PAGES 100 // Max number of migrations per period 129 | 130 | #define LARGE_PAGE_SIZE (2LU*1024*1024) // 2MB 131 | #define pfn_to_large_pfn(pfn) ((pfn)/(512)) // 512 4KB pages in a 2MB page 132 | #define pfn_to_bucket(pfn) (pfn_to_large_pfn(pfn) % CACHE_SIZE) 133 | 134 | #define HEAT_PRINT_SIZE 2048 135 | static char *printbuf; 136 | static int total_allocated_pages = 0; 137 | 138 | /* How hot is a cache region? 
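 * The mapping: a 4KB pfn is divided by 512 to get its 2MB frame (pfn_to_large_pfn), and that frame
 * number modulo CACHE_SIZE (48*512 = 24576 slots) selects the heatmap entry; two 2MB pages whose
 * frame numbers differ by a multiple of 24576 therefore compete for the same DRAM-cache slot.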
*/ 139 | static atomic_t heatmap[CACHE_SIZE]; 140 | 141 | /* Ring buffer of samples */ 142 | static struct ring { 143 | atomic_t idx; 144 | struct sample { 145 | int weight; 146 | u64 pfn; 147 | } samples[RING_SIZE]; 148 | } ring; 149 | 150 | /* Metadata to help page allocations */ 151 | struct slot_list { 152 | size_t bucket; 153 | struct page *page; 154 | int heat; 155 | size_t used_by; // tgid 156 | size_t virt_addr; 157 | struct list_head contention_list; 158 | struct list_head bin_list; 159 | }; 160 | static struct list_head unused_pages[CONTENTION_LEVELS]; // heat -> pages 161 | static struct list_head allocated_pages[CONTENTION_LEVELS]; // heat -> pages 162 | static struct list_head heated_pages[CONTENTION_LEVELS]; // heat -> pages 163 | static struct list_head cache_bins[CACHE_SIZE]; // cache slot -> pages 164 | static spinlock_t lock; // list_heads are not thread safe... 165 | 166 | /* Metadata to help page migration */ 167 | struct pid_pages { 168 | struct pid_pages *next; 169 | int nb_pages; 170 | int nb_max_pages; 171 | pid_t tgid; 172 | unsigned long *addresses; 173 | struct page **pages; 174 | }; 175 | static struct pages_container { 176 | int nb_pid; 177 | struct pid_pages *pids; 178 | } pages_to_migrate; 179 | 180 | 181 | /* PEBS configuration */ 182 | #define PEBS_SAMPLE_TYPE PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR 183 | #define PMEM_READ 0x80d1 184 | #define DRAM_READ 0x20d1 185 | #define STORE_ALL 0x82d0 186 | static struct perf_event **events; 187 | static size_t configs[] = { DRAM_READ, PMEM_READ, STORE_ALL }; 188 | //static size_t configs[] = { PMEM_READ }; 189 | 190 | /* Timer */ 191 | struct task_struct *kthread; 192 | 193 | 194 | 195 | 196 | 197 | /* 198 | * Maintain heatmap 199 | * Maintain list of allocated and free pages at the correct contention level. 200 | * The current implementation maintains a ring buffer of samples, and updates the heatmap. 201 | * 202 | * E.g., 203 | * ring = [ pageX, pageY (current idx), pageZ ] 204 | * When receiving a new sample, say on pageW, then 205 | * ring = [ pageX, pageY, pageW (current idx) ] 206 | * and the heat of pageZ is decreased and the heat of pageW is increased. 207 | * The functions also update the heat of the cache slots. 
208 | */ 209 | static size_t heat_to_contention_level(int heat) { 210 | /* Various definitions of contention */ 211 | size_t idx = 0; 212 | //while(heat) { 213 | //heat /= 2; 214 | //idx++; 215 | //} 216 | idx = heat / 50; 217 | if(idx >= CONTENTION_LEVELS) 218 | idx = CONTENTION_LEVELS - 1; 219 | return idx; 220 | } 221 | 222 | /* Make sure all pages are in the correct contention list so that we can choose one from the lowest */ 223 | static void move_pages_contention(size_t bucket, int heat) { 224 | struct slot_list *s; 225 | size_t idx = heat_to_contention_level(heat); 226 | 227 | spin_lock(&lock); 228 | list_for_each_entry(s, &cache_bins[bucket], bin_list) { 229 | if(s->used_by && s->heat) 230 | list_move(&s->contention_list, &heated_pages[idx]); 231 | else if(s->used_by) 232 | list_move(&s->contention_list, &allocated_pages[idx]); 233 | else 234 | list_move(&s->contention_list, &unused_pages[idx]); 235 | } 236 | spin_unlock(&lock); 237 | } 238 | 239 | /* add_sample is a generic function to increaset the head of a page */ 240 | static void add_sample(u64 pfn, int weight) { 241 | size_t bucket = pfn_to_bucket(pfn); 242 | int idx = atomic_inc_return(&ring.idx) - 1; 243 | 244 | //return; 245 | 246 | if(idx >= RING_SIZE) { 247 | int old_weight = ring.samples[idx % RING_SIZE].weight; 248 | int old_pfn = ring.samples[idx % RING_SIZE].pfn; 249 | size_t old_bucket = pfn_to_bucket(old_pfn); 250 | atomic_sub(old_weight, &heatmap[old_bucket]); 251 | move_pages_contention(old_bucket, atomic_read(&heatmap[old_bucket])); 252 | } 253 | 254 | ring.samples[idx % RING_SIZE].pfn = pfn; 255 | ring.samples[idx % RING_SIZE].weight = weight; 256 | atomic_add(weight, &heatmap[bucket]); 257 | move_pages_contention(bucket, atomic_read(&heatmap[bucket])); 258 | 259 | { 260 | struct slot_list *s, *found = NULL; 261 | size_t bucket = pfn_to_bucket(pfn); 262 | spin_lock(&lock); 263 | list_for_each_entry(s, &cache_bins[bucket], bin_list) { 264 | if(pfn_to_large_pfn(page_to_pfn(s->page)) == pfn_to_large_pfn(pfn)) { 265 | found = s; 266 | found->heat += weight; 267 | list_move(&found->contention_list, &heated_pages[heat_to_contention_level(atomic_read(&heatmap[bucket]))]); 268 | break; 269 | } 270 | } 271 | spin_unlock(&lock); 272 | if(found) { 273 | total_samples_found++; 274 | } else if(total_samples % 10000 == 0) { 275 | printk("Didn't find pfn %lu at GB %lu\n", (long unsigned)pfn, (long unsigned)pfn*4096/1024/1024/1024/100); 276 | } 277 | 278 | total_samples++; 279 | } 280 | } 281 | 282 | /* 283 | * At initialization time, add a new discovered 2MB page 284 | */ 285 | static int add_new_page(struct page *page) { 286 | size_t bucket = pfn_to_bucket(page_to_pfn(page)); 287 | size_t idx = heat_to_contention_level(atomic_read(&heatmap[bucket])); 288 | struct slot_list *s = kmalloc(sizeof(*s), GFP_KERNEL); 289 | if(!s) { 290 | printk(KERN_INFO "Fail to allocate all the pages!\n"); 291 | return -1; 292 | } 293 | 294 | s->bucket = bucket; 295 | s->page = page; 296 | s->used_by = 0; 297 | s->heat = 0; 298 | INIT_LIST_HEAD(&s->bin_list); 299 | INIT_LIST_HEAD(&s->contention_list); 300 | list_add(&s->bin_list, &cache_bins[bucket]); 301 | list_add(&s->contention_list, &unused_pages[idx]); 302 | return 0; 303 | } 304 | 305 | /* 306 | * After munmap, re-add the page in the free list 307 | */ 308 | static int add_freed_page(struct page *page) { 309 | struct slot_list *s; 310 | size_t bucket = pfn_to_bucket(page_to_pfn(page)); 311 | 312 | atomic_sub(50, &heatmap[bucket]); 313 | move_pages_contention(bucket, 
atomic_read(&heatmap[bucket])); 314 | 315 | { 316 | int heat = atomic_read(&heatmap[bucket]); 317 | int found = 0; 318 | size_t idx = heat_to_contention_level(heat); 319 | 320 | spin_lock(&lock); 321 | list_for_each_entry(s, &cache_bins[bucket], bin_list) { 322 | if(s->page == page) { 323 | s->used_by = 0; 324 | list_move(&s->contention_list, &unused_pages[idx]); 325 | found = 1; 326 | //printk("Freeing page %lu %lx\n", pfn_to_large_pfn(page_to_pfn(page)), s->virt_addr); 327 | break; 328 | } 329 | } 330 | spin_unlock(&lock); 331 | 332 | if(!found) 333 | printk("Didn't find page %lu\n", pfn_to_large_pfn(page_to_pfn(page))); 334 | } 335 | return 0; 336 | } 337 | 338 | /* 339 | * Mark a 2MB page as used (done just after mmap) 340 | */ 341 | static void reserve_page(struct hstate *h, int nid, pid_t pid, struct slot_list *s, int contention_idx, struct vm_area_struct *vma, unsigned long addr) 342 | { 343 | size_t bucket = pfn_to_bucket(page_to_pfn(s->page)); 344 | 345 | spin_lock(&lock); 346 | s->used_by = pid; 347 | s->heat = 0; 348 | s->virt_addr = addr; 349 | list_move(&s->contention_list, &allocated_pages[contention_idx]); 350 | spin_unlock(&lock); 351 | 352 | atomic_add(50, &heatmap[bucket]); 353 | move_pages_contention(bucket, atomic_read(&heatmap[bucket])); 354 | 355 | list_move(&s->page->lru, &h->hugepage_activelist); 356 | set_page_count(s->page, 1); 357 | ClearHPageFreed(s->page); 358 | h->free_huge_pages--; 359 | h->free_huge_pages_node[nid]--; 360 | 361 | total_allocated_pages++; 362 | //printk("Allocating page %lu heat %d here %lu %lx\n", pfn_to_large_pfn(page_to_pfn(s->page)), atomic_read(&heatmap[bucket]), addr, addr); 363 | } 364 | 365 | struct hstate *size_to_hstate(unsigned long size) 366 | { 367 | struct hstate *h; 368 | struct hstate *hstates = get_hstates(); 369 | int hugetlb_max_hstate = get_max_hstates(); 370 | 371 | for_each_hstate(h) { 372 | if (huge_page_size(h) == size) 373 | return h; 374 | } 375 | return NULL; 376 | } 377 | 378 | /* 379 | * Function called at initialization time to build the list of 2MB pages 380 | */ 381 | static void build_page_list(void) { 382 | size_t i, nb_pages = 0; 383 | int nid = 0; // TODO! 384 | struct hstate *h; 385 | struct page *page; 386 | bool pin = !!(current->flags & PF_MEMALLOC_PIN); 387 | struct hstate *hstates = get_hstates(); 388 | int hugetlb_max_hstate = get_max_hstates(); 389 | 390 | for(i = 0; i < CONTENTION_LEVELS; i++) 391 | INIT_LIST_HEAD(&unused_pages[i]); 392 | for(i = 0; i < CONTENTION_LEVELS; i++) 393 | INIT_LIST_HEAD(&allocated_pages[i]); 394 | for(i = 0; i < CONTENTION_LEVELS; i++) 395 | INIT_LIST_HEAD(&heated_pages[i]); 396 | for(i = 0; i < CACHE_SIZE; i++) 397 | INIT_LIST_HEAD(&cache_bins[i]); 398 | 399 | // Pages have 0 refcount here -- but let's not increase it to not interfere with the kernel 400 | // They are not going to disappear anyway, except if the user reduces the number of 401 | // hugetlbfs pages... 
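// The loop below walks the hugetlbfs free lists (node 0 only, see the nid TODO above) and
// registers every usable free 2MB page with add_new_page(), which files it into cache_bins[]
// and unused_pages[] so that the allocation hook can later pick the least-contended slot.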
402 | for_each_hstate(h) { 403 | list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { 404 | if (pin && !is_pinnable_page(page)) 405 | continue; 406 | 407 | if (PageHWPoison(page)) 408 | continue; 409 | 410 | nb_pages++; 411 | if(add_new_page(page) != 0) 412 | break; 413 | } 414 | } 415 | printk("Successfully created a list of %lu pages\n", nb_pages); 416 | /*{ 417 | int i; 418 | struct slot_list *s; 419 | for(i = 0; i < CACHE_SIZE; i++) { 420 | int nb_entries = 0, nb_allocated = 0; 421 | list_for_each_entry(s, &cache_bins[i], bin_list) { 422 | nb_entries++; 423 | if(s->used_by) 424 | nb_allocated++; 425 | } 426 | printk("CHECK: cache_bins[%d] = %d entries %d allocated %d heat\n", i, nb_entries, nb_allocated, atomic_read(&heatmap[i])); 427 | } 428 | }*/ 429 | } 430 | 431 | static __attribute__((unused)) void check_list(void) { 432 | size_t i; 433 | struct slot_list *s; 434 | for(i = 0; i < CONTENTION_LEVELS; i++) { 435 | int nb_entries = 0; 436 | list_for_each_entry(s, &unused_pages[i], contention_list) { 437 | nb_entries++; 438 | } 439 | printk("CHECK: contention[%lu] = %d entries\n", i, nb_entries); 440 | } 441 | return; 442 | for(i = 0; i < CACHE_SIZE; i++) { 443 | int nb_entries = 0, nb_allocated = 0; 444 | list_for_each_entry(s, &cache_bins[i], bin_list) { 445 | nb_entries++; 446 | if(s->used_by) 447 | nb_allocated++; 448 | } 449 | printk("CHECK: cache_bins[%lu] = %d entries %d allocated %d heat\n", i, nb_entries, nb_allocated, atomic_read(&heatmap[i])); 450 | } 451 | } 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | /* 460 | * Page choice algorithm 461 | */ 462 | static struct page *_minimize_conflicts(struct hstate *h, int nid, pid_t pid, struct vm_area_struct *vma, unsigned long addr) 463 | { 464 | size_t i; 465 | struct page *ret = NULL; 466 | struct slot_list *s = NULL; 467 | lockdep_assert_held(&hugetlb_lock); 468 | 469 | for(i = 0; i < CONTENTION_LEVELS; i++) { 470 | //for(i = CONTENTION_LEVELS - 1; i >= 0; i--) { 471 | s = list_first_entry_or_null(&unused_pages[i], struct slot_list, contention_list); 472 | /*size_t lowest_pfn = 0; 473 | list_for_each_entry(sl, &unused_pages[i], contention_list) { 474 | if(lowest_pfn == 0 || page_to_pfn(sl->page) < lowest_pfn) { 475 | s = sl; 476 | lowest_pfn = page_to_pfn(sl->page); 477 | } 478 | }*/ 479 | if(s) 480 | break; 481 | } 482 | 483 | if(s) { 484 | ret = s->page; 485 | reserve_page(h, nid, pid, s, i, vma, addr); 486 | } 487 | return ret; 488 | } 489 | 490 | static struct page *minimize_conflicts(struct hstate *h, int nid, struct vm_area_struct *vma, unsigned long addr) { 491 | return _minimize_conflicts(h, nid, current->tgid, vma, addr); 492 | } 493 | 494 | static struct page *enqueue_freed_page(struct hstate *h, struct page *page) { 495 | int nid = page_to_nid(page); 496 | 497 | lockdep_assert_held(&hugetlb_lock); 498 | VM_BUG_ON_PAGE(page_count(page), page); 499 | 500 | list_move(&page->lru, &h->hugepage_freelists[nid]); 501 | h->free_huge_pages++; 502 | h->free_huge_pages_node[nid]++; 503 | SetHPageFreed(page); 504 | 505 | add_freed_page(page); 506 | return page; 507 | } 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | /* 518 | * PEBS 519 | */ 520 | static u64 perf_virt_to_phys(u64 virt) 521 | { 522 | u64 phys_addr = 0; 523 | 524 | if (!virt) 525 | return 0; 526 | 527 | if (virt >= TASK_SIZE) { 528 | if (virt_addr_valid((void *)(uintptr_t)virt) && 529 | !(virt >= VMALLOC_START && virt < VMALLOC_END)) 530 | phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); 531 | } else { 532 | if (current->mm != NULL) { 533 
| struct page *p; 534 | pagefault_disable(); 535 | if (get_user_page_fast_only(virt, 0, &p)) { 536 | phys_addr = (page_to_pfn(p) << PAGE_SHIFT) + virt % PAGE_SIZE; 537 | put_page(p); 538 | } 539 | pagefault_enable(); 540 | } 541 | } 542 | return phys_addr; 543 | } 544 | 545 | static void pebs_sample(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) 546 | { 547 | size_t phys = perf_virt_to_phys(data->addr); 548 | size_t pfn = phys / 4096; 549 | if(phys && pfn) {} 550 | add_sample(pfn, (event->attr.config == STORE_ALL)?10:1); // TODO 551 | //printk("Event %p (config %llx) CPU %u vs %u Tid %u Virt addr: %llx Phys %llx\n", event, event->attr.config, data->cpu_entry.cpu, smp_processor_id(), current->pid, data->addr, perf_virt_to_phys(data->addr)); 552 | } 553 | 554 | 555 | 556 | 557 | 558 | /* 559 | * Periodic migrations 560 | * 1/ Find pages with high contention 561 | * 2/ Place these pages in a per pid container 562 | * 3/ Migrate pages per pid using the migrate_page kernel function 563 | */ 564 | struct pid_pages *insert_pid_in_container(struct pages_container *c, pid_t tgid) { 565 | struct pid_pages *p = kmalloc(sizeof(*p), GFP_KERNEL); 566 | memset(p, 0, sizeof(*p)); 567 | p->tgid = tgid; 568 | p->next = c->pids; 569 | c->pids = p; 570 | c->nb_pid++; 571 | return p; 572 | } 573 | 574 | void insert_page_in_container(struct pages_container *c, pid_t tgid, unsigned long addr, struct page *page) { 575 | struct pid_pages *p = NULL; 576 | struct pid_pages *l = c->pids; 577 | while(l) { 578 | if(l->tgid == tgid) { 579 | p = l; 580 | break; 581 | } 582 | l = l->next; 583 | } 584 | 585 | if(!p) 586 | p = insert_pid_in_container(c, tgid); 587 | 588 | if(p->nb_pages >= p->nb_max_pages) { 589 | if(p->nb_max_pages) { 590 | p->nb_max_pages *= 2; 591 | } else { 592 | p->nb_max_pages = 256; 593 | } 594 | p->addresses = krealloc(p->addresses, sizeof(*p->addresses)*p->nb_max_pages, GFP_KERNEL); 595 | p->pages = krealloc(p->pages, sizeof(*p->pages)*p->nb_max_pages, GFP_KERNEL); 596 | } 597 | p->addresses[p->nb_pages] = addr; 598 | p->pages[p->nb_pages] = page; 599 | p->nb_pages++; 600 | } 601 | 602 | 603 | static void clean_container(struct pages_container *c) { 604 | struct pid_pages *p, *tmp; 605 | for(p = c->pids; p;) { 606 | if(p->pages) 607 | kfree(p->pages); 608 | if(p->addresses) 609 | kfree(p->addresses); 610 | tmp = p; 611 | p = p->next; 612 | kfree(tmp); 613 | } 614 | c->pids = 0; 615 | c->nb_pid = 0; 616 | } 617 | 618 | 619 | struct task_struct *_find_task_by_vpid(pid_t vnr) 620 | { 621 | return pid_task(find_pid_ns(vnr, task_active_pid_ns(current)), PIDTYPE_PID); 622 | } 623 | 624 | struct page *alloc_migration_target(struct page *old, unsigned long private) { 625 | struct hstate *h = page_hstate(old); 626 | struct migration_target_control *mtc = (void*)private; 627 | return _minimize_conflicts(h, 0, mtc->pid, NULL, (unsigned long)page_address(old)); 628 | } 629 | 630 | /* Equivalent of the move_pages syscall, except it doesn't migrate pages between NUMA nodes but within the same node */ 631 | /* TODO: optimize. We might not need this whole VMA and MM overhead since we already have the struct page... */ 632 | /* ... but at least we know that the following code is safe. 
pages are refcounted so they are not going to be freed under us */ 633 | int s_migrate_pages(pid_t pid, unsigned long nr_pages, unsigned long *addresses, struct page **pages) { 634 | int nb_migrations = 0, i; 635 | struct task_struct *task; 636 | struct mm_struct *mm; 637 | LIST_HEAD(pagelist); 638 | struct migration_target_control mtc = { 639 | .pid = pid 640 | }; 641 | 642 | rcu_read_lock(); 643 | task = _find_task_by_vpid(pid); 644 | if (!task) { 645 | rcu_read_unlock(); 646 | return -1; 647 | } 648 | rcu_read_unlock(); 649 | 650 | get_task_struct(task); 651 | mm = get_task_mm(task); 652 | put_task_struct(task); 653 | 654 | if (!mm) 655 | return -2; 656 | 657 | mmap_read_lock(mm); 658 | for(i = 0; i < nr_pages; i++) { 659 | //struct page *page = pages[i]; // unsafe? 660 | struct page *page; 661 | struct vm_area_struct *vma = find_vma(mm, addresses[i]); 662 | if(!vma) { 663 | printk("Couldn't find VMA for page %lx\n", addresses[i]); 664 | continue; 665 | } 666 | 667 | page = follow_page(vma, addresses[i], FOLL_GET | FOLL_DUMP); // increment page_ref_count 668 | if(!page) // refcount should be == 2 here, 1 because the page is mapped, 1 because follow_page increments it, unless page got freed 669 | continue; 670 | 671 | isolate_huge_page(page, &pagelist); // also increment page_ref_count, refcount == 3 672 | put_page(page); // decrement the refcount, should be == 2 now 673 | 674 | nb_migrations++; 675 | } 676 | mmap_read_unlock(mm); 677 | 678 | migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_NUMA_MISPLACED, NULL); // will decrement all refcounts 679 | 680 | mmput(mm); 681 | return nb_migrations; 682 | } 683 | 684 | static int periodic_migrations(void* data) 685 | { 686 | if(!ENABLE_PERIODIC_MIGRATIONS) 687 | return 0; 688 | 689 | memset(&pages_to_migrate, 0, sizeof(pages_to_migrate)); 690 | while(true) { 691 | struct pid_pages *p; 692 | struct slot_list *s; 693 | int idx, nb_attempts = 0, nb_done = 0; 694 | 695 | if(kthread_should_stop()) { 696 | printk("Exiting periodic migrations"); 697 | return 0; 698 | } 699 | 700 | spin_lock(&lock); 701 | for(idx = CONTENTION_LEVELS - 1; idx > PROBLEMATIC_CONTENTION; idx--) { 702 | list_for_each_entry(s, &heated_pages[idx], contention_list) { 703 | int nb_heated_pages = 0; 704 | { 705 | struct slot_list *_s; 706 | size_t bucket = pfn_to_bucket(page_to_pfn(s->page)); 707 | //printk("\tWill migrate pfn %lu bucket %lu heat %d pfn heat %d\n", page_to_pfn(s->page), bucket, atomic_read(&heatmap[bucket]), s->heat); 708 | list_for_each_entry(_s, &cache_bins[bucket], bin_list) { 709 | if(_s->heat) 710 | nb_heated_pages++; 711 | //printk("\t\tpfn %lu heat %d\n", page_to_pfn(_s->page), _s->heat); 712 | } 713 | } 714 | if(nb_heated_pages > 1) { 715 | struct slot_list *_s; 716 | size_t bucket = pfn_to_bucket(page_to_pfn(s->page)); 717 | printk("\tWill migrate %lx pfn %lu bucket %lu heat %d pfn heat %d\n", s->virt_addr, page_to_pfn(s->page), bucket, atomic_read(&heatmap[bucket]), s->heat); 718 | list_for_each_entry(_s, &cache_bins[bucket], bin_list) { 719 | if(_s->heat) 720 | nb_heated_pages++; 721 | printk("\t\tpfn %lu heat %d\n", page_to_pfn(_s->page), _s->heat); 722 | } 723 | insert_page_in_container(&pages_to_migrate, s->used_by, s->virt_addr, s->page); 724 | //list_del(&s->contention_list); // page will be freed 725 | nb_attempts++; 726 | if(nb_attempts >= MAX_MIGRATED_PAGES) 727 | goto end; 728 | } 729 | } 730 | } 731 | end: 732 | spin_unlock(&lock); 733 | 734 | for(p = pages_to_migrate.pids; p; p = p->next) { 735 
| nb_done = s_migrate_pages(p->tgid, p->nb_pages, p->addresses, p->pages); 736 | } 737 | 738 | 739 | printk("[Periodic Migrations] Looked at %d pages, %d succeeded\n", nb_attempts, nb_done); 740 | 741 | clean_container(&pages_to_migrate); 742 | msleep_interruptible(1000); 743 | } 744 | return 0; 745 | } 746 | 747 | 748 | 749 | /* 750 | * Init and cleanup 751 | */ 752 | int init_module(void) 753 | { 754 | size_t config, cpu, ncpus = num_online_cpus(); 755 | static struct perf_event_attr wd_hw_attr = { 756 | .type = PERF_TYPE_RAW, 757 | .size = sizeof(struct perf_event_attr), 758 | .pinned = 0, 759 | .disabled = 1, 760 | .precise_ip = 2, 761 | .sample_id_all = 1, 762 | .exclude_kernel = 1, 763 | .exclude_guest = 1, 764 | .exclude_hv = 0, 765 | .exclude_user =0, 766 | .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR, 767 | }; 768 | 769 | /* Clear structures */ 770 | printbuf = vmalloc(HEAT_PRINT_SIZE); 771 | memset(heatmap, 0, sizeof(heatmap)); 772 | memset(&ring, 0, sizeof(ring)); 773 | 774 | /* Create all the metadata for the hugepages */ 775 | build_page_list(); 776 | 777 | /* Launch perf events */ 778 | events = vmalloc(ncpus * ARRAY_SIZE(configs) * sizeof(*events)); 779 | printk(KERN_INFO "Creating %lu events - %lu configs %lu cpus\n", ARRAY_SIZE(configs) * ncpus, ARRAY_SIZE(configs), ncpus); 780 | for(config = 0; ENABLE_SAMPLING && (config < ARRAY_SIZE(configs)); config++) { 781 | for(cpu = 0; cpu < ncpus; cpu++) { 782 | size_t idx = config * ncpus + cpu; 783 | wd_hw_attr.config = configs[config]; 784 | if(configs[config] == STORE_ALL) 785 | wd_hw_attr.sample_period = 5003; 786 | else 787 | wd_hw_attr.sample_period = 5003; 788 | events[idx] = perf_event_create_kernel_counter(&wd_hw_attr, cpu, NULL, pebs_sample, NULL); 789 | if(IS_ERR(events[idx])) { 790 | printk(KERN_INFO "Could not create event %lu on cpu %lu\n", configs[config], cpu); 791 | return -1; 792 | } 793 | perf_event_enable(events[idx]); 794 | printk("Enab\n"); 795 | } 796 | } 797 | 798 | /* Tell the kernel to use our dequeue/enqueue functions for smart alloc/freeing of huge pages */ 799 | set_dequeue_hook(minimize_conflicts); 800 | set_enqueue_hook(enqueue_freed_page); 801 | 802 | /* Run periodic migrations */ 803 | kthread = kthread_run(periodic_migrations, NULL, "kthread-periodic-migrations"); 804 | get_task_struct(kthread); 805 | 806 | return 0; 807 | } 808 | 809 | void cleanup_module(void) 810 | { 811 | size_t cpu, config, ncpus = num_online_cpus(); 812 | 813 | /* Tell the kernel to stop using our dequeue/enqueue functions */ 814 | set_dequeue_hook(NULL); 815 | set_enqueue_hook(NULL); 816 | 817 | /* Stop periodic migrations */ 818 | kthread_stop(kthread); 819 | put_task_struct(kthread); 820 | 821 | /* Print some debug info */ 822 | printk(KERN_INFO "Goodbye world. Total allocated pages %d\n", total_allocated_pages); 823 | printk(KERN_INFO "Goodbye world. 
Total samples %llu found %llu\n", total_samples, total_samples_found); 824 | for(config = 0; ENABLE_SAMPLING && (config < ARRAY_SIZE(configs)); config++) { 825 | for(cpu = 0; cpu < ncpus; cpu++) { 826 | size_t idx = config * ncpus + cpu; 827 | perf_event_disable(events[idx]); 828 | perf_event_release_kernel(events[idx]); 829 | } 830 | } 831 | check_list(); 832 | vfree(printbuf); 833 | } 834 | 835 | module_param(max_conflicts, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); 836 | MODULE_PARM_DESC(max_conflicts, "Set to 1 to maximize conflicts"); 837 | MODULE_LICENSE("GPL"); 838 | --------------------------------------------------------------------------------