├── 0001-Huge-page-allocation-hook.patch ├── 0002-Add-kernel-hooks.patch ├── Makefile ├── README.md └── jc.c /0001-Huge-page-allocation-hook.patch: -------------------------------------------------------------------------------- 1 | From ff726a38bac8d0cc7a1c2934aaedb00666c204f3 Mon Sep 17 00:00:00 2001 2 | From: Baptiste Lepers 3 | Date: Wed, 25 May 2022 06:51:45 +0000 4 | Subject: [PATCH 1/1] Huge page allocation hook 5 | 6 | --- 7 | include/linux/hugetlb.h | 4 ++++ 8 | mm/hugetlb.c | 10 ++++++++++ 9 | tools/perf/Makefile.perf | 3 ++- 10 | 3 files changed, 16 insertions(+), 1 deletion(-) 11 | 12 | diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h 13 | index 1faebe1cd0ed..dae593460e2c 100644 14 | --- a/include/linux/hugetlb.h 15 | +++ b/include/linux/hugetlb.h 16 | @@ -1103,4 +1103,8 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); 17 | #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) 18 | #endif 19 | 20 | + 21 | +typedef struct page *(*dequeue_hook_t)(struct hstate *h, int nid); 22 | +void set_dequeue_hook(dequeue_hook_t hook); 23 | + 24 | #endif /* _LINUX_HUGETLB_H */ 25 | diff --git a/mm/hugetlb.c b/mm/hugetlb.c 26 | index 95dc7b83381f..16f06224a9b9 100644 27 | --- a/mm/hugetlb.c 28 | +++ b/mm/hugetlb.c 29 | @@ -1080,11 +1080,21 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) 30 | SetHPageFreed(page); 31 | } 32 | 33 | +static dequeue_hook_t dequeue_hook = NULL; 34 | +void set_dequeue_hook(dequeue_hook_t hook) 35 | +{ 36 | + dequeue_hook = hook; 37 | +} 38 | +EXPORT_SYMBOL(set_dequeue_hook); 39 | + 40 | static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) 41 | { 42 | struct page *page; 43 | bool pin = !!(current->flags & PF_MEMALLOC_PIN); 44 | 45 | + if(dequeue_hook) 46 | + return dequeue_hook(h, nid); 47 | + 48 | lockdep_assert_held(&hugetlb_lock); 49 | list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { 50 | if (pin && !is_pinnable_page(page)) 51 | diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf 52 | index e04313c4d840..cd3641e15343 100644 53 | --- a/tools/perf/Makefile.perf 54 | +++ b/tools/perf/Makefile.perf 55 | @@ -51,7 +51,8 @@ include ../scripts/utilities.mak 56 | # Define GTK2 if you want GTK+ GUI support. 57 | # 58 | # Define NO_DEMANGLE if you do not want C++ symbol demangling. 59 | -# 60 | +# 61 | +CFLAGS += -DNO_DEMANGLE 62 | # Define NO_LIBELF if you do not want libelf dependency (e.g. 
cross-builds) 63 | # 64 | # Define NO_LIBUNWIND if you do not want libunwind dependency for dwarf 65 | -- 66 | 2.25.1 67 | 68 | -------------------------------------------------------------------------------- /0002-Add-kernel-hooks.patch: -------------------------------------------------------------------------------- 1 | From deb09ca95fe3067a67595a1e4fe56a6f6f7da6d1 Mon Sep 17 00:00:00 2001 2 | From: Baptiste Lepers 3 | Date: Wed, 27 Jul 2022 00:58:32 +0000 4 | Subject: [PATCH 2/2] Add kernel hooks 5 | 6 | --- 7 | include/linux/hugetlb.h | 8 ++++++++ 8 | kernel/events/core.c | 2 ++ 9 | mm/hugetlb.c | 27 +++++++++++++++++++++++++++ 10 | mm/migrate.c | 1 + 11 | mm/vmscan.c | 1 + 12 | 5 files changed, 39 insertions(+) 13 | 14 | diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h 15 | index dae593460e2c..298f99550203 100644 16 | --- a/include/linux/hugetlb.h 17 | +++ b/include/linux/hugetlb.h 18 | @@ -115,6 +115,8 @@ extern struct resv_map *resv_map_alloc(void); 19 | void resv_map_release(struct kref *ref); 20 | 21 | extern spinlock_t hugetlb_lock; 22 | +spinlock_t *get_hugetlb_lock(void); 23 | + 24 | extern int hugetlb_max_hstate __read_mostly; 25 | #define for_each_hstate(h) \ 26 | for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) 27 | @@ -1107,4 +1109,10 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); 28 | typedef struct page *(*dequeue_hook_t)(struct hstate *h, int nid); 29 | void set_dequeue_hook(dequeue_hook_t hook); 30 | 31 | +typedef struct page *(*enqueue_hook_t)(struct hstate *h, struct page *page); 32 | +void set_enqueue_hook(enqueue_hook_t hook); 33 | + 34 | +int get_max_hstates(void); 35 | +struct hstate *get_hstates(void); 36 | + 37 | #endif /* _LINUX_HUGETLB_H */ 38 | diff --git a/kernel/events/core.c b/kernel/events/core.c 39 | index 0c000cb01eeb..a2d96352f59d 100644 40 | --- a/kernel/events/core.c 41 | +++ b/kernel/events/core.c 42 | @@ -7441,6 +7441,8 @@ void perf_prepare_sample(struct perf_event_header *header, 43 | */ 44 | WARN_ON_ONCE(header->size & 7); 45 | } 46 | +EXPORT_SYMBOL_GPL(perf_prepare_sample); 47 | + 48 | 49 | static __always_inline int 50 | __perf_event_output(struct perf_event *event, 51 | diff --git a/mm/hugetlb.c b/mm/hugetlb.c 52 | index 16f06224a9b9..c7cacf19a769 100644 53 | --- a/mm/hugetlb.c 54 | +++ b/mm/hugetlb.c 55 | @@ -72,6 +72,10 @@ static bool __initdata parsed_default_hugepagesz; 56 | * free_huge_pages, and surplus_huge_pages. 57 | */ 58 | DEFINE_SPINLOCK(hugetlb_lock); 59 | +spinlock_t *get_hugetlb_lock(void) { 60 | + return &hugetlb_lock; 61 | +} 62 | +EXPORT_SYMBOL(get_hugetlb_lock); 63 | 64 | /* 65 | * Serializes faults on the same logical page. 
This is used to 66 | @@ -1067,10 +1071,22 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) 67 | return false; 68 | } 69 | 70 | +static enqueue_hook_t enqueue_hook = NULL; 71 | +void set_enqueue_hook(enqueue_hook_t hook) 72 | +{ 73 | + enqueue_hook = hook; 74 | +} 75 | +EXPORT_SYMBOL(set_enqueue_hook); 76 | + 77 | static void enqueue_huge_page(struct hstate *h, struct page *page) 78 | { 79 | int nid = page_to_nid(page); 80 | 81 | + if(enqueue_hook) { 82 | + enqueue_hook(h, page); 83 | + return; 84 | + } 85 | + 86 | lockdep_assert_held(&hugetlb_lock); 87 | VM_BUG_ON_PAGE(page_count(page), page); 88 | 89 | @@ -1078,6 +1094,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) 90 | h->free_huge_pages++; 91 | h->free_huge_pages_node[nid]++; 92 | SetHPageFreed(page); 93 | + 94 | } 95 | 96 | static dequeue_hook_t dequeue_hook = NULL; 97 | @@ -1087,6 +1104,16 @@ void set_dequeue_hook(dequeue_hook_t hook) 98 | } 99 | EXPORT_SYMBOL(set_dequeue_hook); 100 | 101 | +int get_max_hstates(void) { 102 | + return hugetlb_max_hstate; 103 | +} 104 | +EXPORT_SYMBOL(get_max_hstates); 105 | + 106 | +struct hstate *get_hstates(void) { 107 | + return &hstates[0]; 108 | +} 109 | +EXPORT_SYMBOL(get_hstates); 110 | + 111 | static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) 112 | { 113 | struct page *page; 114 | diff --git a/mm/migrate.c b/mm/migrate.c 115 | index a6a7743ee98f..eb80305e5fc7 100644 116 | --- a/mm/migrate.c 117 | +++ b/mm/migrate.c 118 | @@ -1601,6 +1601,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, 119 | 120 | return rc; 121 | } 122 | +EXPORT_SYMBOL(migrate_pages); 123 | 124 | struct page *alloc_migration_target(struct page *page, unsigned long private) 125 | { 126 | diff --git a/mm/vmscan.c b/mm/vmscan.c 127 | index 74296c2d1fed..69ae1cda8dfb 100644 128 | --- a/mm/vmscan.c 129 | +++ b/mm/vmscan.c 130 | @@ -2107,6 +2107,7 @@ int isolate_lru_page(struct page *page) 131 | 132 | return ret; 133 | } 134 | +EXPORT_SYMBOL(isolate_lru_page); 135 | 136 | /* 137 | * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and 138 | -- 139 | 2.25.1 140 | 141 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | obj-m += jc.o 2 | 3 | all: 4 | make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules 5 | 6 | clean: 7 | make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Johnny Cache: the End of DRAM Cache Conflicts (in Tiered Main Memory Systems) 2 | 3 | Paper: [https://www.usenix.org/conference/osdi23/presentation/lepers](https://www.usenix.org/conference/osdi23/presentation/lepers) 4 | 5 | ## Usage 6 | Patch your kernel using the provided patches. 7 | 8 | Edit jc.c to configure JC (see comment on top of the file). I recommend to first try the module without the dynamic policy (default setting). 9 | 10 | JC only handles 2MB pages. To force an application to allocate huge pages, we use TCMALLOC ([https://github.com/gperftools/gperftools](https://github.com/gperftools/gperftools)). 
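
To check that the kernel hooks are actually being exercised, you do not strictly need TCMALLOC: any hugetlbfs-backed mapping goes through the patched allocation path. The small user-space program below is not part of JC; it is a minimal sketch that assumes the `/mnt/hugetlbfs` mount point created by the commands below and simply touches one 2MB page, so the module's dequeue hook (allocation) and enqueue hook (free) should each fire once.

```c
/* Map and touch a single 2MB huge page from hugetlbfs, then release it.
 * The file name is a placeholder; adjust the path to your mount point. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HUGE_2MB (2UL * 1024 * 1024)

int main(void)
{
    int fd = open("/mnt/hugetlbfs/jc-test", O_CREAT | O_RDWR, 0644);
    if (fd < 0) { perror("open"); return 1; }

    void *p = mmap(NULL, HUGE_2MB, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

    memset(p, 0, HUGE_2MB);           /* first touch faults the page in -> dequeue hook */
    munmap(p, HUGE_2MB);
    close(fd);
    unlink("/mnt/hugetlbfs/jc-test"); /* removing the file frees the page -> enqueue hook */
    return 0;
}
```

After `sudo rmmod jc`, the "Total allocated pages" counter that the module prints to dmesg should have increased accordingly.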
11 | 12 | ```bash 13 | # Reserve a few large pages 14 | sudo sh -c "echo 174080 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages" # 340GB of large pages 15 | sudo mount -t hugetlbfs none /mnt/hugetlbfs 16 | sudo chmod 777 /mnt/hugetlbfs/ 17 | 18 | # Insert the module and launch BC with 20 threads 19 | sudo dmesg -c; # clear dmesg 20 | make; # READ the comment of jc.c BEFORE compiling! 21 | sudo insmod jc.ko; 22 | 23 | numactl --cpunodebind 0 env OMP_NUM_THREADS=20 TCMALLOC_MEMFS_MALLOC_PATH=/mnt/hugetlbfs/ LD_PRELOAD=~/gperftools/.libs/libtcmalloc.so ~/hemem/apps/gapbs/bc -u 29; 24 | 25 | sudo rmmod jc; 26 | sudo dmesg -c; # print stats 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /jc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Johnny Cache 3 | * 4 | * Logic of the code: 5 | * 6 | * 7 | * The static policy is simple; it uses 3 functions: 8 | * build_page_list(); // Builds the list of available pages 9 | * set_dequeue_hook(minimize_conflicts); // Called when allocating a page 10 | * set_enqueue_hook(enqueue_freed_page); // Called when freeing a page 11 | * 12 | * minimize_conflicts calls reserve_page, which increases the heat of a bucket (atomic_add...) 13 | * enqueue_freed_page calls add_freed_page, which decreases the heat of the bucket of the freed page 14 | * 15 | * Both functions call move_pages_contention() to maintain the list of pages ordered by heat 16 | * 17 | * None of these functions are efficient, but they do the job. 18 | * 19 | * 20 | * 21 | * For the dynamic policy, we tried many implementations. None really worked (we have a paragraph 22 | * discussing that in the paper). The current implementation is the following: 23 | * ENABLE_SAMPLING and ENABLE_PERIODIC_MIGRATIONS need to be set to 1 24 | * 25 | * In init_module, we set up the sampling. Whenever a memory access is sampled, pebs_sample is called. 26 | * pebs_sample calls add_sample, which increases the heat of pages & cache slots. 27 | * 28 | * Currently the implementation relies on a ring buffer that stores the last N samples. When a page 29 | * is sampled, its heat increases and its address is placed in the ring. While doing so, the sampled 30 | * page may replace another page, whose heat will be decreased. So basically we maintain a "moving 31 | * average" of the heat of pages, over the last N samples. 32 | * 33 | * Periodically, a kernel thread wakes up and calls periodic_migrations(), which migrates a page in case of conflict. 34 | * 35 | * 36 | * AGAIN, keep in mind that the dynamic migrations DO NOT WORK WELL. As we describe in the paper, configuring 37 | * dynamic migrations is very finicky. You'll need to adjust the sampling rate, etc. to get it to work. 38 | * 39 | * 40 | * 41 | * /!\ IMPORTANT /!\ 42 | * Don't forget to set CACHE_SIZE to the size of your DRAM cache. 43 | * Experiments were done on a machine with 128GB of RAM (64GB per NUMA node), so our cache is 48GB (per NUMA node). 44 | * 45 | */ 46 | #include /* Needed by all modules */ 47 | #include 48 | #include /* Needed for KERN_INFO */ 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | 58 | uint64_t total_samples = 0; 59 | uint64_t total_samples_found = 0; 60 | uint64_t *pfns = NULL; 61 | size_t pfn_idx = 0; 62 | 63 | #define ENABLE_SAMPLING 0 /* Activate dynamic conflict avoidance? 
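Set to 1 (together with ENABLE_PERIODIC_MIGRATIONS below) to enable the dynamic policy described in the header comment.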
*/ 64 | #define ENABLE_PERIODIC_MIGRATIONS 0 /* Should also be 1 for the dynamic policy to work */ 65 | 66 | 67 | /* 68 | * Some helper functions to manipulate pages & bitmasks, copy/pasted from the kernel. 69 | */ 70 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ 71 | static inline unsigned long *get_pageblock_bitmap(const struct page *page, 72 | unsigned long pfn) 73 | { 74 | #ifdef CONFIG_SPARSEMEM 75 | return section_to_usemap(__pfn_to_section(pfn)); 76 | #else 77 | return page_zone(page)->pageblock_flags; 78 | #endif /* CONFIG_SPARSEMEM */ 79 | } 80 | 81 | static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) 82 | { 83 | #ifdef CONFIG_SPARSEMEM 84 | pfn &= (PAGES_PER_SECTION-1); 85 | #else 86 | pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); 87 | #endif /* CONFIG_SPARSEMEM */ 88 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 89 | } 90 | 91 | static __always_inline 92 | unsigned long __get_pfnblock_flags_mask(const struct page *page, 93 | unsigned long pfn, 94 | unsigned long mask) 95 | { 96 | unsigned long *bitmap; 97 | unsigned long bitidx, word_bitidx; 98 | unsigned long word; 99 | 100 | bitmap = get_pageblock_bitmap(page, pfn); 101 | bitidx = pfn_to_bitidx(page, pfn); 102 | word_bitidx = bitidx / BITS_PER_LONG; 103 | bitidx &= (BITS_PER_LONG-1); 104 | 105 | word = bitmap[word_bitidx]; 106 | return (word >> bitidx) & mask; 107 | } 108 | 109 | unsigned long get_pfnblock_flags_mask(const struct page *page, 110 | unsigned long pfn, unsigned long mask) 111 | { 112 | return __get_pfnblock_flags_mask(page, pfn, mask); 113 | } 114 | 115 | struct migration_target_control { 116 | pid_t pid; 117 | }; 118 | 119 | 120 | /* 121 | * Actual code of JC 122 | */ 123 | static int max_conflicts = 0; // module parameter, maximize conflicts or minize them? 124 | #define CACHE_SIZE (48*512) // DRAM is 64GB -> cache is 48GB on our machine -> 48*512 2MB pages 125 | #define RING_SIZE (1000000) // Compute stats on the last XX memory access samples 126 | #define CONTENTION_LEVELS 20 // Number of contention levels (to be in level N, a page must have been accessed heat_to_contention_level(N) times) 127 | #define PROBLEMATIC_CONTENTION 10 // If a page has been accessed more than that, it is hot 128 | #define MAX_MIGRATED_PAGES 100 // Max number of migrations per period 129 | 130 | #define LARGE_PAGE_SIZE (2LU*1024*1024) // 2MB 131 | #define pfn_to_large_pfn(pfn) ((pfn)/(512)) // 512 4KB pages in a 2MB page 132 | #define pfn_to_bucket(pfn) (pfn_to_large_pfn(pfn) % CACHE_SIZE) 133 | 134 | #define HEAT_PRINT_SIZE 2048 135 | static char *printbuf; 136 | static int total_allocated_pages = 0; 137 | 138 | /* How hot is a cache region? 
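 * The mapping: a 4KB pfn is divided by 512 to get its 2MB frame (pfn_to_large_pfn), and that frame
 * number modulo CACHE_SIZE (48*512 = 24576 slots) selects the heatmap entry; two 2MB pages whose
 * frame numbers differ by a multiple of 24576 therefore compete for the same DRAM-cache slot.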
*/ 139 | static atomic_t heatmap[CACHE_SIZE]; 140 | 141 | /* Ring buffer of samples */ 142 | static struct ring { 143 | atomic_t idx; 144 | struct sample { 145 | int weight; 146 | u64 pfn; 147 | } samples[RING_SIZE]; 148 | } ring; 149 | 150 | /* Metadata to help page allocations */ 151 | struct slot_list { 152 | size_t bucket; 153 | struct page *page; 154 | int heat; 155 | size_t used_by; // tgid 156 | size_t virt_addr; 157 | struct list_head contention_list; 158 | struct list_head bin_list; 159 | }; 160 | static struct list_head unused_pages[CONTENTION_LEVELS]; // heat -> pages 161 | static struct list_head allocated_pages[CONTENTION_LEVELS]; // heat -> pages 162 | static struct list_head heated_pages[CONTENTION_LEVELS]; // heat -> pages 163 | static struct list_head cache_bins[CACHE_SIZE]; // cache slot -> pages 164 | static spinlock_t lock; // list_heads are not thread safe... 165 | 166 | /* Metadata to help page migration */ 167 | struct pid_pages { 168 | struct pid_pages *next; 169 | int nb_pages; 170 | int nb_max_pages; 171 | pid_t tgid; 172 | unsigned long *addresses; 173 | struct page **pages; 174 | }; 175 | static struct pages_container { 176 | int nb_pid; 177 | struct pid_pages *pids; 178 | } pages_to_migrate; 179 | 180 | 181 | /* PEBS configuration */ 182 | #define PEBS_SAMPLE_TYPE PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR 183 | #define PMEM_READ 0x80d1 184 | #define DRAM_READ 0x20d1 185 | #define STORE_ALL 0x82d0 186 | static struct perf_event **events; 187 | static size_t configs[] = { DRAM_READ, PMEM_READ, STORE_ALL }; 188 | //static size_t configs[] = { PMEM_READ }; 189 | 190 | /* Timer */ 191 | struct task_struct *kthread; 192 | 193 | 194 | 195 | 196 | 197 | /* 198 | * Maintain heatmap 199 | * Maintain list of allocated and free pages at the correct contention level. 200 | * The current implementation maintains a ring buffer of samples, and updates the heatmap. 201 | * 202 | * E.g., 203 | * ring = [ pageX, pageY (current idx), pageZ ] 204 | * When receiving a new sample, say on pageW, then 205 | * ring = [ pageX, pageY, pageW (current idx) ] 206 | * and the heat of pageZ is decreased and the heat of pageW is increased. 207 | * The functions also update the heat of the cache slots. 
208 | */ 209 | static size_t heat_to_contention_level(int heat) { 210 | /* Various definitions of contention */ 211 | size_t idx = 0; 212 | //while(heat) { 213 | //heat /= 2; 214 | //idx++; 215 | //} 216 | idx = heat / 50; 217 | if(idx >= CONTENTION_LEVELS) 218 | idx = CONTENTION_LEVELS - 1; 219 | return idx; 220 | } 221 | 222 | /* Make sure all pages are in the correct contention list so that we can choose one from the lowest */ 223 | static void move_pages_contention(size_t bucket, int heat) { 224 | struct slot_list *s; 225 | size_t idx = heat_to_contention_level(heat); 226 | 227 | spin_lock(&lock); 228 | list_for_each_entry(s, &cache_bins[bucket], bin_list) { 229 | if(s->used_by && s->heat) 230 | list_move(&s->contention_list, &heated_pages[idx]); 231 | else if(s->used_by) 232 | list_move(&s->contention_list, &allocated_pages[idx]); 233 | else 234 | list_move(&s->contention_list, &unused_pages[idx]); 235 | } 236 | spin_unlock(&lock); 237 | } 238 | 239 | /* add_sample is a generic function to increaset the head of a page */ 240 | static void add_sample(u64 pfn, int weight) { 241 | size_t bucket = pfn_to_bucket(pfn); 242 | int idx = atomic_inc_return(&ring.idx) - 1; 243 | 244 | //return; 245 | 246 | if(idx >= RING_SIZE) { 247 | int old_weight = ring.samples[idx % RING_SIZE].weight; 248 | int old_pfn = ring.samples[idx % RING_SIZE].pfn; 249 | size_t old_bucket = pfn_to_bucket(old_pfn); 250 | atomic_sub(old_weight, &heatmap[old_bucket]); 251 | move_pages_contention(old_bucket, atomic_read(&heatmap[old_bucket])); 252 | } 253 | 254 | ring.samples[idx % RING_SIZE].pfn = pfn; 255 | ring.samples[idx % RING_SIZE].weight = weight; 256 | atomic_add(weight, &heatmap[bucket]); 257 | move_pages_contention(bucket, atomic_read(&heatmap[bucket])); 258 | 259 | { 260 | struct slot_list *s, *found = NULL; 261 | size_t bucket = pfn_to_bucket(pfn); 262 | spin_lock(&lock); 263 | list_for_each_entry(s, &cache_bins[bucket], bin_list) { 264 | if(pfn_to_large_pfn(page_to_pfn(s->page)) == pfn_to_large_pfn(pfn)) { 265 | found = s; 266 | found->heat += weight; 267 | list_move(&found->contention_list, &heated_pages[heat_to_contention_level(atomic_read(&heatmap[bucket]))]); 268 | break; 269 | } 270 | } 271 | spin_unlock(&lock); 272 | if(found) { 273 | total_samples_found++; 274 | } else if(total_samples % 10000 == 0) { 275 | printk("Didn't find pfn %lu at GB %lu\n", (long unsigned)pfn, (long unsigned)pfn*4096/1024/1024/1024/100); 276 | } 277 | 278 | total_samples++; 279 | } 280 | } 281 | 282 | /* 283 | * At initialization time, add a new discovered 2MB page 284 | */ 285 | static int add_new_page(struct page *page) { 286 | size_t bucket = pfn_to_bucket(page_to_pfn(page)); 287 | size_t idx = heat_to_contention_level(atomic_read(&heatmap[bucket])); 288 | struct slot_list *s = kmalloc(sizeof(*s), GFP_KERNEL); 289 | if(!s) { 290 | printk(KERN_INFO "Fail to allocate all the pages!\n"); 291 | return -1; 292 | } 293 | 294 | s->bucket = bucket; 295 | s->page = page; 296 | s->used_by = 0; 297 | s->heat = 0; 298 | INIT_LIST_HEAD(&s->bin_list); 299 | INIT_LIST_HEAD(&s->contention_list); 300 | list_add(&s->bin_list, &cache_bins[bucket]); 301 | list_add(&s->contention_list, &unused_pages[idx]); 302 | return 0; 303 | } 304 | 305 | /* 306 | * After munmap, re-add the page in the free list 307 | */ 308 | static int add_freed_page(struct page *page) { 309 | struct slot_list *s; 310 | size_t bucket = pfn_to_bucket(page_to_pfn(page)); 311 | 312 | atomic_sub(50, &heatmap[bucket]); 313 | move_pages_contention(bucket, 
atomic_read(&heatmap[bucket])); 314 | 315 | { 316 | int heat = atomic_read(&heatmap[bucket]); 317 | int found = 0; 318 | size_t idx = heat_to_contention_level(heat); 319 | 320 | spin_lock(&lock); 321 | list_for_each_entry(s, &cache_bins[bucket], bin_list) { 322 | if(s->page == page) { 323 | s->used_by = 0; 324 | list_move(&s->contention_list, &unused_pages[idx]); 325 | found = 1; 326 | //printk("Freeing page %lu %lx\n", pfn_to_large_pfn(page_to_pfn(page)), s->virt_addr); 327 | break; 328 | } 329 | } 330 | spin_unlock(&lock); 331 | 332 | if(!found) 333 | printk("Didn't find page %lu\n", pfn_to_large_pfn(page_to_pfn(page))); 334 | } 335 | return 0; 336 | } 337 | 338 | /* 339 | * Mark a 2MB page as used (done just after mmap) 340 | */ 341 | static void reserve_page(struct hstate *h, int nid, pid_t pid, struct slot_list *s, int contention_idx, struct vm_area_struct *vma, unsigned long addr) 342 | { 343 | size_t bucket = pfn_to_bucket(page_to_pfn(s->page)); 344 | 345 | spin_lock(&lock); 346 | s->used_by = pid; 347 | s->heat = 0; 348 | s->virt_addr = addr; 349 | list_move(&s->contention_list, &allocated_pages[contention_idx]); 350 | spin_unlock(&lock); 351 | 352 | atomic_add(50, &heatmap[bucket]); 353 | move_pages_contention(bucket, atomic_read(&heatmap[bucket])); 354 | 355 | list_move(&s->page->lru, &h->hugepage_activelist); 356 | set_page_count(s->page, 1); 357 | ClearHPageFreed(s->page); 358 | h->free_huge_pages--; 359 | h->free_huge_pages_node[nid]--; 360 | 361 | total_allocated_pages++; 362 | //printk("Allocating page %lu heat %d here %lu %lx\n", pfn_to_large_pfn(page_to_pfn(s->page)), atomic_read(&heatmap[bucket]), addr, addr); 363 | } 364 | 365 | struct hstate *size_to_hstate(unsigned long size) 366 | { 367 | struct hstate *h; 368 | struct hstate *hstates = get_hstates(); 369 | int hugetlb_max_hstate = get_max_hstates(); 370 | 371 | for_each_hstate(h) { 372 | if (huge_page_size(h) == size) 373 | return h; 374 | } 375 | return NULL; 376 | } 377 | 378 | /* 379 | * Function called at initialization time to build the list of 2MB pages 380 | */ 381 | static void build_page_list(void) { 382 | size_t i, nb_pages = 0; 383 | int nid = 0; // TODO! 384 | struct hstate *h; 385 | struct page *page; 386 | bool pin = !!(current->flags & PF_MEMALLOC_PIN); 387 | struct hstate *hstates = get_hstates(); 388 | int hugetlb_max_hstate = get_max_hstates(); 389 | 390 | for(i = 0; i < CONTENTION_LEVELS; i++) 391 | INIT_LIST_HEAD(&unused_pages[i]); 392 | for(i = 0; i < CONTENTION_LEVELS; i++) 393 | INIT_LIST_HEAD(&allocated_pages[i]); 394 | for(i = 0; i < CONTENTION_LEVELS; i++) 395 | INIT_LIST_HEAD(&heated_pages[i]); 396 | for(i = 0; i < CACHE_SIZE; i++) 397 | INIT_LIST_HEAD(&cache_bins[i]); 398 | 399 | // Pages have 0 refcount here -- but let's not increase it to not interfere with the kernel 400 | // They are not going to disappear anyway, except if the user reduces the number of 401 | // hugetlbfs pages... 
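// The loop below walks the hugetlbfs free lists (node 0 only, see the nid TODO above) and
// registers every usable free 2MB page with add_new_page(), which files it into cache_bins[]
// and unused_pages[] so that the allocation hook can later pick the least-contended slot.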
402 | for_each_hstate(h) { 403 | list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { 404 | if (pin && !is_pinnable_page(page)) 405 | continue; 406 | 407 | if (PageHWPoison(page)) 408 | continue; 409 | 410 | nb_pages++; 411 | if(add_new_page(page) != 0) 412 | break; 413 | } 414 | } 415 | printk("Successfully created a list of %lu pages\n", nb_pages); 416 | /*{ 417 | int i; 418 | struct slot_list *s; 419 | for(i = 0; i < CACHE_SIZE; i++) { 420 | int nb_entries = 0, nb_allocated = 0; 421 | list_for_each_entry(s, &cache_bins[i], bin_list) { 422 | nb_entries++; 423 | if(s->used_by) 424 | nb_allocated++; 425 | } 426 | printk("CHECK: cache_bins[%d] = %d entries %d allocated %d heat\n", i, nb_entries, nb_allocated, atomic_read(&heatmap[i])); 427 | } 428 | }*/ 429 | } 430 | 431 | static __attribute__((unused)) void check_list(void) { 432 | size_t i; 433 | struct slot_list *s; 434 | for(i = 0; i < CONTENTION_LEVELS; i++) { 435 | int nb_entries = 0; 436 | list_for_each_entry(s, &unused_pages[i], contention_list) { 437 | nb_entries++; 438 | } 439 | printk("CHECK: contention[%lu] = %d entries\n", i, nb_entries); 440 | } 441 | return; 442 | for(i = 0; i < CACHE_SIZE; i++) { 443 | int nb_entries = 0, nb_allocated = 0; 444 | list_for_each_entry(s, &cache_bins[i], bin_list) { 445 | nb_entries++; 446 | if(s->used_by) 447 | nb_allocated++; 448 | } 449 | printk("CHECK: cache_bins[%lu] = %d entries %d allocated %d heat\n", i, nb_entries, nb_allocated, atomic_read(&heatmap[i])); 450 | } 451 | } 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | /* 460 | * Page choice algorithm 461 | */ 462 | static struct page *_minimize_conflicts(struct hstate *h, int nid, pid_t pid, struct vm_area_struct *vma, unsigned long addr) 463 | { 464 | size_t i; 465 | struct page *ret = NULL; 466 | struct slot_list *s = NULL; 467 | lockdep_assert_held(&hugetlb_lock); 468 | 469 | for(i = 0; i < CONTENTION_LEVELS; i++) { 470 | //for(i = CONTENTION_LEVELS - 1; i >= 0; i--) { 471 | s = list_first_entry_or_null(&unused_pages[i], struct slot_list, contention_list); 472 | /*size_t lowest_pfn = 0; 473 | list_for_each_entry(sl, &unused_pages[i], contention_list) { 474 | if(lowest_pfn == 0 || page_to_pfn(sl->page) < lowest_pfn) { 475 | s = sl; 476 | lowest_pfn = page_to_pfn(sl->page); 477 | } 478 | }*/ 479 | if(s) 480 | break; 481 | } 482 | 483 | if(s) { 484 | ret = s->page; 485 | reserve_page(h, nid, pid, s, i, vma, addr); 486 | } 487 | return ret; 488 | } 489 | 490 | static struct page *minimize_conflicts(struct hstate *h, int nid, struct vm_area_struct *vma, unsigned long addr) { 491 | return _minimize_conflicts(h, nid, current->tgid, vma, addr); 492 | } 493 | 494 | static struct page *enqueue_freed_page(struct hstate *h, struct page *page) { 495 | int nid = page_to_nid(page); 496 | 497 | lockdep_assert_held(&hugetlb_lock); 498 | VM_BUG_ON_PAGE(page_count(page), page); 499 | 500 | list_move(&page->lru, &h->hugepage_freelists[nid]); 501 | h->free_huge_pages++; 502 | h->free_huge_pages_node[nid]++; 503 | SetHPageFreed(page); 504 | 505 | add_freed_page(page); 506 | return page; 507 | } 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | /* 518 | * PEBS 519 | */ 520 | static u64 perf_virt_to_phys(u64 virt) 521 | { 522 | u64 phys_addr = 0; 523 | 524 | if (!virt) 525 | return 0; 526 | 527 | if (virt >= TASK_SIZE) { 528 | if (virt_addr_valid((void *)(uintptr_t)virt) && 529 | !(virt >= VMALLOC_START && virt < VMALLOC_END)) 530 | phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); 531 | } else { 532 | if (current->mm != NULL) { 533 
| struct page *p; 534 | pagefault_disable(); 535 | if (get_user_page_fast_only(virt, 0, &p)) { 536 | phys_addr = (page_to_pfn(p) << PAGE_SHIFT) + virt % PAGE_SIZE; 537 | put_page(p); 538 | } 539 | pagefault_enable(); 540 | } 541 | } 542 | return phys_addr; 543 | } 544 | 545 | static void pebs_sample(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) 546 | { 547 | size_t phys = perf_virt_to_phys(data->addr); 548 | size_t pfn = phys / 4096; 549 | if(phys && pfn) {} 550 | add_sample(pfn, (event->attr.config == STORE_ALL)?10:1); // TODO 551 | //printk("Event %p (config %llx) CPU %u vs %u Tid %u Virt addr: %llx Phys %llx\n", event, event->attr.config, data->cpu_entry.cpu, smp_processor_id(), current->pid, data->addr, perf_virt_to_phys(data->addr)); 552 | } 553 | 554 | 555 | 556 | 557 | 558 | /* 559 | * Periodic migrations 560 | * 1/ Find pages with high contention 561 | * 2/ Place these pages in a per pid container 562 | * 3/ Migrate pages per pid using the migrate_page kernel function 563 | */ 564 | struct pid_pages *insert_pid_in_container(struct pages_container *c, pid_t tgid) { 565 | struct pid_pages *p = kmalloc(sizeof(*p), GFP_KERNEL); 566 | memset(p, 0, sizeof(*p)); 567 | p->tgid = tgid; 568 | p->next = c->pids; 569 | c->pids = p; 570 | c->nb_pid++; 571 | return p; 572 | } 573 | 574 | void insert_page_in_container(struct pages_container *c, pid_t tgid, unsigned long addr, struct page *page) { 575 | struct pid_pages *p = NULL; 576 | struct pid_pages *l = c->pids; 577 | while(l) { 578 | if(l->tgid == tgid) { 579 | p = l; 580 | break; 581 | } 582 | l = l->next; 583 | } 584 | 585 | if(!p) 586 | p = insert_pid_in_container(c, tgid); 587 | 588 | if(p->nb_pages >= p->nb_max_pages) { 589 | if(p->nb_max_pages) { 590 | p->nb_max_pages *= 2; 591 | } else { 592 | p->nb_max_pages = 256; 593 | } 594 | p->addresses = krealloc(p->addresses, sizeof(*p->addresses)*p->nb_max_pages, GFP_KERNEL); 595 | p->pages = krealloc(p->pages, sizeof(*p->pages)*p->nb_max_pages, GFP_KERNEL); 596 | } 597 | p->addresses[p->nb_pages] = addr; 598 | p->pages[p->nb_pages] = page; 599 | p->nb_pages++; 600 | } 601 | 602 | 603 | static void clean_container(struct pages_container *c) { 604 | struct pid_pages *p, *tmp; 605 | for(p = c->pids; p;) { 606 | if(p->pages) 607 | kfree(p->pages); 608 | if(p->addresses) 609 | kfree(p->addresses); 610 | tmp = p; 611 | p = p->next; 612 | kfree(tmp); 613 | } 614 | c->pids = 0; 615 | c->nb_pid = 0; 616 | } 617 | 618 | 619 | struct task_struct *_find_task_by_vpid(pid_t vnr) 620 | { 621 | return pid_task(find_pid_ns(vnr, task_active_pid_ns(current)), PIDTYPE_PID); 622 | } 623 | 624 | struct page *alloc_migration_target(struct page *old, unsigned long private) { 625 | struct hstate *h = page_hstate(old); 626 | struct migration_target_control *mtc = (void*)private; 627 | return _minimize_conflicts(h, 0, mtc->pid, NULL, (unsigned long)page_address(old)); 628 | } 629 | 630 | /* Equivalent of the move_pages syscall, except it doesn't migrate pages between NUMA nodes but within the same node */ 631 | /* TODO: optimize. We might not need this whole VMA and MM overhead since we already have the struct page... */ 632 | /* ... but at least we know that the following code is safe. 
pages are refcounted so they are not going to be freed under us */ 633 | int s_migrate_pages(pid_t pid, unsigned long nr_pages, unsigned long *addresses, struct page **pages) { 634 | int nb_migrations = 0, i; 635 | struct task_struct *task; 636 | struct mm_struct *mm; 637 | LIST_HEAD(pagelist); 638 | struct migration_target_control mtc = { 639 | .pid = pid 640 | }; 641 | 642 | rcu_read_lock(); 643 | task = _find_task_by_vpid(pid); 644 | if (!task) { 645 | rcu_read_unlock(); 646 | return -1; 647 | } 648 | rcu_read_unlock(); 649 | 650 | get_task_struct(task); 651 | mm = get_task_mm(task); 652 | put_task_struct(task); 653 | 654 | if (!mm) 655 | return -2; 656 | 657 | mmap_read_lock(mm); 658 | for(i = 0; i < nr_pages; i++) { 659 | //struct page *page = pages[i]; // unsafe? 660 | struct page *page; 661 | struct vm_area_struct *vma = find_vma(mm, addresses[i]); 662 | if(!vma) { 663 | printk("Couldn't find VMA for page %lx\n", addresses[i]); 664 | continue; 665 | } 666 | 667 | page = follow_page(vma, addresses[i], FOLL_GET | FOLL_DUMP); // increment page_ref_count 668 | if(!page) // refcount should be == 2 here, 1 because the page is mapped, 1 because follow_page increments it, unless page got freed 669 | continue; 670 | 671 | isolate_huge_page(page, &pagelist); // also increment page_ref_count, refcount == 3 672 | put_page(page); // decrement the refcount, should be == 2 now 673 | 674 | nb_migrations++; 675 | } 676 | mmap_read_unlock(mm); 677 | 678 | migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_NUMA_MISPLACED, NULL); // will decrement all refcounts 679 | 680 | mmput(mm); 681 | return nb_migrations; 682 | } 683 | 684 | static int periodic_migrations(void* data) 685 | { 686 | if(!ENABLE_PERIODIC_MIGRATIONS) 687 | return 0; 688 | 689 | memset(&pages_to_migrate, 0, sizeof(pages_to_migrate)); 690 | while(true) { 691 | struct pid_pages *p; 692 | struct slot_list *s; 693 | int idx, nb_attempts = 0, nb_done = 0; 694 | 695 | if(kthread_should_stop()) { 696 | printk("Exiting periodic migrations"); 697 | return 0; 698 | } 699 | 700 | spin_lock(&lock); 701 | for(idx = CONTENTION_LEVELS - 1; idx > PROBLEMATIC_CONTENTION; idx--) { 702 | list_for_each_entry(s, &heated_pages[idx], contention_list) { 703 | int nb_heated_pages = 0; 704 | { 705 | struct slot_list *_s; 706 | size_t bucket = pfn_to_bucket(page_to_pfn(s->page)); 707 | //printk("\tWill migrate pfn %lu bucket %lu heat %d pfn heat %d\n", page_to_pfn(s->page), bucket, atomic_read(&heatmap[bucket]), s->heat); 708 | list_for_each_entry(_s, &cache_bins[bucket], bin_list) { 709 | if(_s->heat) 710 | nb_heated_pages++; 711 | //printk("\t\tpfn %lu heat %d\n", page_to_pfn(_s->page), _s->heat); 712 | } 713 | } 714 | if(nb_heated_pages > 1) { 715 | struct slot_list *_s; 716 | size_t bucket = pfn_to_bucket(page_to_pfn(s->page)); 717 | printk("\tWill migrate %lx pfn %lu bucket %lu heat %d pfn heat %d\n", s->virt_addr, page_to_pfn(s->page), bucket, atomic_read(&heatmap[bucket]), s->heat); 718 | list_for_each_entry(_s, &cache_bins[bucket], bin_list) { 719 | if(_s->heat) 720 | nb_heated_pages++; 721 | printk("\t\tpfn %lu heat %d\n", page_to_pfn(_s->page), _s->heat); 722 | } 723 | insert_page_in_container(&pages_to_migrate, s->used_by, s->virt_addr, s->page); 724 | //list_del(&s->contention_list); // page will be freed 725 | nb_attempts++; 726 | if(nb_attempts >= MAX_MIGRATED_PAGES) 727 | goto end; 728 | } 729 | } 730 | } 731 | end: 732 | spin_unlock(&lock); 733 | 734 | for(p = pages_to_migrate.pids; p; p = p->next) { 735 
| nb_done = s_migrate_pages(p->tgid, p->nb_pages, p->addresses, p->pages); 736 | } 737 | 738 | 739 | printk("[Periodic Migrations] Looked at %d pages, %d succeeded\n", nb_attempts, nb_done); 740 | 741 | clean_container(&pages_to_migrate); 742 | msleep_interruptible(1000); 743 | } 744 | return 0; 745 | } 746 | 747 | 748 | 749 | /* 750 | * Init and cleanup 751 | */ 752 | int init_module(void) 753 | { 754 | size_t config, cpu, ncpus = num_online_cpus(); 755 | static struct perf_event_attr wd_hw_attr = { 756 | .type = PERF_TYPE_RAW, 757 | .size = sizeof(struct perf_event_attr), 758 | .pinned = 0, 759 | .disabled = 1, 760 | .precise_ip = 2, 761 | .sample_id_all = 1, 762 | .exclude_kernel = 1, 763 | .exclude_guest = 1, 764 | .exclude_hv = 0, 765 | .exclude_user =0, 766 | .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR, 767 | }; 768 | 769 | /* Clear structures */ 770 | printbuf = vmalloc(HEAT_PRINT_SIZE); 771 | memset(heatmap, 0, sizeof(heatmap)); 772 | memset(&ring, 0, sizeof(ring)); 773 | 774 | /* Create all the metadata for the hugepages */ 775 | build_page_list(); 776 | 777 | /* Launch perf events */ 778 | events = vmalloc(ncpus * ARRAY_SIZE(configs) * sizeof(*events)); 779 | printk(KERN_INFO "Creating %lu events - %lu configs %lu cpus\n", ARRAY_SIZE(configs) * ncpus, ARRAY_SIZE(configs), ncpus); 780 | for(config = 0; ENABLE_SAMPLING && (config < ARRAY_SIZE(configs)); config++) { 781 | for(cpu = 0; cpu < ncpus; cpu++) { 782 | size_t idx = config * ncpus + cpu; 783 | wd_hw_attr.config = configs[config]; 784 | if(configs[config] == STORE_ALL) 785 | wd_hw_attr.sample_period = 5003; 786 | else 787 | wd_hw_attr.sample_period = 5003; 788 | events[idx] = perf_event_create_kernel_counter(&wd_hw_attr, cpu, NULL, pebs_sample, NULL); 789 | if(IS_ERR(events[idx])) { 790 | printk(KERN_INFO "Could not create event %lu on cpu %lu\n", configs[config], cpu); 791 | return -1; 792 | } 793 | perf_event_enable(events[idx]); 794 | printk("Enab\n"); 795 | } 796 | } 797 | 798 | /* Tell the kernel to use our dequeue/enqueue functions for smart alloc/freeing of huge pages */ 799 | set_dequeue_hook(minimize_conflicts); 800 | set_enqueue_hook(enqueue_freed_page); 801 | 802 | /* Run periodic migrations */ 803 | kthread = kthread_run(periodic_migrations, NULL, "kthread-periodic-migrations"); 804 | get_task_struct(kthread); 805 | 806 | return 0; 807 | } 808 | 809 | void cleanup_module(void) 810 | { 811 | size_t cpu, config, ncpus = num_online_cpus(); 812 | 813 | /* Tell the kernel to stop using our dequeue/enqueue functions */ 814 | set_dequeue_hook(NULL); 815 | set_enqueue_hook(NULL); 816 | 817 | /* Stop periodic migrations */ 818 | kthread_stop(kthread); 819 | put_task_struct(kthread); 820 | 821 | /* Print some debug info */ 822 | printk(KERN_INFO "Goodbye world. Total allocated pages %d\n", total_allocated_pages); 823 | printk(KERN_INFO "Goodbye world. 
Total samples %llu found %llu\n", total_samples, total_samples_found); 824 | for(config = 0; ENABLE_SAMPLING && (config < ARRAY_SIZE(configs)); config++) { 825 | for(cpu = 0; cpu < ncpus; cpu++) { 826 | size_t idx = config * ncpus + cpu; 827 | perf_event_disable(events[idx]); 828 | perf_event_release_kernel(events[idx]); 829 | } 830 | } 831 | check_list(); 832 | vfree(printbuf); 833 | } 834 | 835 | module_param(max_conflicts, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); 836 | MODULE_PARM_DESC(max_conflicts, "Set to 1 to maximize conflicts"); 837 | MODULE_LICENSE("GPL"); 838 | --------------------------------------------------------------------------------